diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9048d1d83f187..9854bbe9f43a4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15952,6 +15952,54 @@ static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
                      DAG.getIntPtrConstant(0, DL));
 }
 
+// Match truncation of both 512-bit operands and concat results together.
+// TODO: Similar to lowerShuffleAsVTRUNC - merge or share matching code?
+static SDValue lowerShuffleAsVTRUNCAndConcat(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  assert(VT.is512BitVector() && VT.getScalarSizeInBits() < 64 &&
+         "Unexpected type!");
+  if (!Subtarget.hasAVX512())
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned DstSizeInBits = VT.getScalarSizeInBits();
+  unsigned SrcSizeInBits = DstSizeInBits * 2;
+
+  // TODO: Support non-BWI VPMOVWB truncations?
+  if (SrcSizeInBits < 32 && !Subtarget.hasBWI())
+    return SDValue();
+
+  // Match shuffle <0,2,4,..> or <1,3,5,..> (every other element).
+  // TODO: Handle general Scale factors with undef/zero upper elements.
+  for (unsigned Offset = 0; Offset != 2; ++Offset) {
+    if (!isSequentialOrUndefInRange(Mask, 0, NumElts, Offset, 2))
+      continue;
+
+    MVT DstVT = MVT::getVectorVT(VT.getScalarType(), NumElts / 2);
+    MVT SrcSVT = MVT::getIntegerVT(SrcSizeInBits);
+    MVT SrcVT = MVT::getVectorVT(SrcSVT, NumElts / 2);
+
+    V1 = DAG.getBitcast(SrcVT, V1);
+    V2 = DAG.getBitcast(SrcVT, V2);
+
+    if (Offset) {
+      V1 = DAG.getNode(
+          X86ISD::VSRLI, DL, SrcVT, V1,
+          DAG.getTargetConstant(Offset * DstSizeInBits, DL, MVT::i8));
+      V2 = DAG.getNode(
+          X86ISD::VSRLI, DL, SrcVT, V2,
+          DAG.getTargetConstant(Offset * DstSizeInBits, DL, MVT::i8));
+    }
+
+    V1 = DAG.getNode(ISD::TRUNCATE, DL, DstVT, V1);
+    V2 = DAG.getNode(ISD::TRUNCATE, DL, DstVT, V2);
+    return concatSubVectors(V1, V2, DAG, DL);
+  }
+
+  return SDValue();
+}
+
 // a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
 // b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
 // =>
@@ -17312,6 +17360,10 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                               Zeroable, Subtarget, DAG))
     return PSHUFB;
 
+  if (SDValue Trunc = lowerShuffleAsVTRUNCAndConcat(
+          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return Trunc;
+
   return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
 }
 
@@ -17367,6 +17419,10 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                              Zeroable, Subtarget, DAG))
     return PSHUFB;
 
+  if (SDValue Trunc = lowerShuffleAsVTRUNCAndConcat(
+          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return Trunc;
+
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -52615,6 +52671,17 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
       return splitVectorStore(St, DAG);
   }
 
+  // Split a concatenation of truncations to fold to truncating stores.
+  if (VT.is512BitVector() && Subtarget.hasAVX512() && StVT == VT &&
+      StoredVal.hasOneUse()) {
+    SmallVector<SDValue, 4> Ops;
+    if (collectConcatOps(StoredVal.getNode(), Ops, DAG) &&
+        all_of(Ops, [&](SDValue Op) {
+          return Op.getOpcode() == ISD::TRUNCATE && Op.hasOneUse();
+        }))
+      return splitVectorStore(St, DAG);
+  }
+
   // Split under-aligned vector non-temporal stores.
if (St->isNonTemporal() && StVT == VT && St->getAlign().value() < VT.getStoreSize()) { diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 22b5246443fa8..e8a6fd66ba360 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -2638,14 +2638,14 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero +; AVX512F-NEXT: vpmovdb %zmm6, 16(%rdi) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero +; AVX512F-NEXT: vpmovdb %zmm6, (%rdi) ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vpmovdb %zmm4, 48(%rdi) ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero ; AVX512F-NEXT: vpmovdb %zmm4, 32(%rdi) -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero -; AVX512F-NEXT: vpmovdb %zmm4, 16(%rdi) -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero -; AVX512F-NEXT: vpmovdb %zmm4, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: smulo_v64i8: diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 4d7d2573183e0..6a15fc2f12dba 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -2301,14 +2301,14 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero +; AVX512F-NEXT: vpmovdb %zmm6, 16(%rdi) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero +; AVX512F-NEXT: vpmovdb %zmm6, (%rdi) ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = 
ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vpmovdb %zmm4, 48(%rdi) ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero ; AVX512F-NEXT: vpmovdb %zmm4, 32(%rdi) -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero -; AVX512F-NEXT: vpmovdb %zmm4, 16(%rdi) -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero -; AVX512F-NEXT: vpmovdb %zmm4, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: umulo_v64i8: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index b3d8d05f69947..02a30f6fad804 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -4,14 +4,14 @@ ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512-VL -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512-FCP -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ-FCP -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW-FCP -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512DQ-BW -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ-BW-FCP +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s 
--check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512 ; These patterns are produced by LoopVectorizer for interleaved loads. @@ -572,109 +572,18 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; -; AVX512-VL-LABEL: load_i16_stride2_vf32: -; AVX512-VL: # %bb.0: -; AVX512-VL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-VL-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-VL-NEXT: vpsrld $16, %zmm0, %zmm2 -; AVX512-VL-NEXT: vpsrld $16, %zmm1, %zmm3 -; AVX512-VL-NEXT: vpmovdw %zmm1, 32(%rsi) -; AVX512-VL-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512-VL-NEXT: vpmovdw %zmm3, 32(%rdx) -; AVX512-VL-NEXT: vpmovdw %zmm2, (%rdx) -; AVX512-VL-NEXT: vzeroupper -; AVX512-VL-NEXT: retq -; -; AVX512-FCP-LABEL: load_i16_stride2_vf32: -; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpsrld $16, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpsrld $16, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovdw %zmm1, 32(%rsi) -; AVX512-FCP-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512-FCP-NEXT: vpmovdw %zmm3, 32(%rdx) -; AVX512-FCP-NEXT: vpmovdw %zmm2, (%rdx) -; AVX512-FCP-NEXT: vzeroupper -; AVX512-FCP-NEXT: retq -; -; AVX512DQ-LABEL: load_i16_stride2_vf32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpsrld $16, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovdw %zmm1, 32(%rsi) -; AVX512DQ-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512DQ-NEXT: vpmovdw %zmm3, 32(%rdx) -; AVX512DQ-NEXT: vpmovdw %zmm2, (%rdx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQ-FCP-LABEL: load_i16_stride2_vf32: -; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovdw %zmm1, 32(%rsi) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm3, 32(%rdx) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vzeroupper -; AVX512DQ-FCP-NEXT: retq -; -; AVX512BW-LABEL: load_i16_stride2_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BW-FCP-LABEL: load_i16_stride2_vf32: -; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: 
vpmovsxbw {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512BW-FCP-NEXT: vzeroupper -; AVX512BW-FCP-NEXT: retq -; -; AVX512DQ-BW-LABEL: load_i16_stride2_vf32: -; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512DQ-BW-NEXT: vzeroupper -; AVX512DQ-BW-NEXT: retq -; -; AVX512DQ-BW-FCP-LABEL: load_i16_stride2_vf32: -; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vzeroupper -; AVX512DQ-BW-FCP-NEXT: retq +; AVX512-LABEL: load_i16_stride2_vf32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vpsrld $16, %zmm0, %zmm2 +; AVX512-NEXT: vpsrld $16, %zmm1, %zmm3 +; AVX512-NEXT: vpmovdw %zmm1, 32(%rsi) +; AVX512-NEXT: vpmovdw %zmm0, (%rsi) +; AVX512-NEXT: vpmovdw %zmm3, 32(%rdx) +; AVX512-NEXT: vpmovdw %zmm2, (%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %wide.vec = load <64 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <32 x i32> %strided.vec1 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <32 x i32> @@ -1099,173 +1008,27 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; -; AVX512-VL-LABEL: load_i16_stride2_vf64: -; AVX512-VL: # %bb.0: -; AVX512-VL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-VL-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-VL-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-VL-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-VL-NEXT: vpmovdw %zmm1, %ymm4 -; AVX512-VL-NEXT: vpsrld $16, %zmm1, %zmm1 -; AVX512-VL-NEXT: vpsrld $16, %zmm0, %zmm5 -; AVX512-VL-NEXT: vpsrld $16, %zmm3, %zmm6 -; AVX512-VL-NEXT: vpsrld $16, %zmm2, %zmm7 -; AVX512-VL-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512-VL-NEXT: vmovdqa %ymm4, 32(%rsi) -; AVX512-VL-NEXT: vpmovdw %zmm2, 64(%rsi) -; AVX512-VL-NEXT: vpmovdw %zmm3, 96(%rsi) -; AVX512-VL-NEXT: vpmovdw %zmm7, 64(%rdx) -; AVX512-VL-NEXT: vpmovdw %zmm6, 96(%rdx) -; AVX512-VL-NEXT: vpmovdw %zmm5, (%rdx) -; AVX512-VL-NEXT: vpmovdw %zmm1, 32(%rdx) -; 
AVX512-VL-NEXT: vzeroupper -; AVX512-VL-NEXT: retq -; -; AVX512-FCP-LABEL: load_i16_stride2_vf64: -; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vpmovdw %zmm1, %ymm4 -; AVX512-FCP-NEXT: vpsrld $16, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpsrld $16, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpsrld $16, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpsrld $16, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm4, 32(%rsi) -; AVX512-FCP-NEXT: vpmovdw %zmm2, 64(%rsi) -; AVX512-FCP-NEXT: vpmovdw %zmm3, 96(%rsi) -; AVX512-FCP-NEXT: vpmovdw %zmm7, 64(%rdx) -; AVX512-FCP-NEXT: vpmovdw %zmm6, 96(%rdx) -; AVX512-FCP-NEXT: vpmovdw %zmm5, (%rdx) -; AVX512-FCP-NEXT: vpmovdw %zmm1, 32(%rdx) -; AVX512-FCP-NEXT: vzeroupper -; AVX512-FCP-NEXT: retq -; -; AVX512DQ-LABEL: load_i16_stride2_vf64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm4 -; AVX512DQ-NEXT: vpsrld $16, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpsrld $16, %zmm3, %zmm6 -; AVX512DQ-NEXT: vpsrld $16, %zmm2, %zmm7 -; AVX512DQ-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512DQ-NEXT: vmovdqa %ymm4, 32(%rsi) -; AVX512DQ-NEXT: vpmovdw %zmm2, 64(%rsi) -; AVX512DQ-NEXT: vpmovdw %zmm3, 96(%rsi) -; AVX512DQ-NEXT: vpmovdw %zmm7, 64(%rdx) -; AVX512DQ-NEXT: vpmovdw %zmm6, 96(%rdx) -; AVX512DQ-NEXT: vpmovdw %zmm5, (%rdx) -; AVX512DQ-NEXT: vpmovdw %zmm1, 32(%rdx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQ-FCP-LABEL: load_i16_stride2_vf64: -; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovdw %zmm1, %ymm4 -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, 32(%rsi) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm2, 64(%rsi) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm3, 96(%rsi) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm7, 64(%rdx) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm6, 96(%rdx) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm5, (%rdx) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm1, 32(%rdx) -; AVX512DQ-FCP-NEXT: vzeroupper -; AVX512DQ-FCP-NEXT: retq -; -; AVX512BW-LABEL: load_i16_stride2_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512BW-NEXT: vmovdqa64 
%zmm2, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BW-FCP-LABEL: load_i16_stride2_vf64: -; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] -; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FCP-NEXT: vzeroupper -; AVX512BW-FCP-NEXT: retq -; -; AVX512DQ-BW-LABEL: load_i16_stride2_vf64: -; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-BW-NEXT: vzeroupper -; AVX512DQ-BW-NEXT: retq -; -; AVX512DQ-BW-FCP-LABEL: load_i16_stride2_vf64: -; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vzeroupper -; AVX512DQ-BW-FCP-NEXT: retq +; AVX512-LABEL: load_i16_stride2_vf64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512-NEXT: vpmovdw %zmm1, %ymm4 +; 
AVX512-NEXT: vpsrld $16, %zmm1, %zmm1 +; AVX512-NEXT: vpsrld $16, %zmm0, %zmm5 +; AVX512-NEXT: vpsrld $16, %zmm3, %zmm6 +; AVX512-NEXT: vpsrld $16, %zmm2, %zmm7 +; AVX512-NEXT: vpmovdw %zmm0, (%rsi) +; AVX512-NEXT: vmovdqa %ymm4, 32(%rsi) +; AVX512-NEXT: vpmovdw %zmm2, 64(%rsi) +; AVX512-NEXT: vpmovdw %zmm3, 96(%rsi) +; AVX512-NEXT: vpmovdw %zmm7, 64(%rdx) +; AVX512-NEXT: vpmovdw %zmm6, 96(%rdx) +; AVX512-NEXT: vpmovdw %zmm5, (%rdx) +; AVX512-NEXT: vpmovdw %zmm1, 32(%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %wide.vec = load <128 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <64 x i32> %strided.vec1 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <64 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 9f69a3cf44189..8a4d518b95444 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -1136,15 +1136,12 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm3 +; AVX512BW-NEXT: vpmovwb %zmm1, 32(%rsi) +; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512BW-NEXT: vpmovwb %zmm3, 32(%rdx) +; AVX512BW-NEXT: vpmovwb %zmm2, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1152,15 +1149,12 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = 
zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-FCP-NEXT: vpsrlw $8, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpsrlw $8, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpmovwb %zmm1, 32(%rsi) +; AVX512BW-FCP-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512BW-FCP-NEXT: vpmovwb %zmm3, 32(%rdx) +; AVX512BW-FCP-NEXT: vpmovwb %zmm2, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -1168,15 +1162,12 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpsrlw $8, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpmovwb %zmm1, 32(%rsi) +; AVX512DQ-BW-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512DQ-BW-NEXT: vpmovwb %zmm3, 32(%rdx) +; AVX512DQ-BW-NEXT: vpmovwb %zmm2, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -1184,15 +1175,12 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpsrlw $8, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpsrlw $8, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovwb %zmm1, 32(%rsi) +; AVX512DQ-BW-FCP-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpmovwb %zmm3, 32(%rdx) +; AVX512DQ-BW-FCP-NEXT: vpmovwb %zmm2, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <128 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index 469c087ec9c08..d8f10b675bfab 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -1484,9 +1484,9 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7] -; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: @@ -1505,8 +1505,11 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_ ; ; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63,65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127] -; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 +; AVX512VBMI-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512VBMI-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512VBMI-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VBMI-NEXT: retq %1 = lshr <32 x i16> %a0, %2 = lshr <32 x i16> %a1,