diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0fc50dc1a87b6..b07843523a15b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6443,25 +6443,36 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
   }
   case ISD::SHL:
   case ISD::SRL: {
-    // We can only decode 'whole byte' bit shifts as shuffles.
-    std::optional<uint64_t> Amt = DAG.getValidShiftAmount(N, DemandedElts);
-    if (!Amt || (*Amt % 8) != 0)
+    APInt UndefElts;
+    SmallVector<APInt> EltBits;
+    if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
+                                       UndefElts, EltBits,
+                                       /*AllowWholeUndefs*/ true,
+                                       /*AllowPartialUndefs*/ false))
       return false;
-    uint64_t ByteShift = *Amt / 8;
-    Ops.push_back(N.getOperand(0));
+    // We can only decode 'whole byte' bit shifts as shuffles.
+    for (unsigned I = 0; I != NumElts; ++I)
+      if (DemandedElts[I] && !UndefElts[I] &&
+          (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
+        return false;
 
-    // Clear mask to all zeros and insert the shifted byte indices.
-    Mask.append(NumSizeInBytes, SM_SentinelZero);
+    Mask.append(NumSizeInBytes, SM_SentinelUndef);
+    Ops.push_back(N.getOperand(0));
 
-    if (ISD::SHL == Opcode) {
-      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
-        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
-          Mask[i + j] = i + j - ByteShift;
-    } else {
-      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
-        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
-          Mask[i + j - ByteShift] = i + j;
+    for (unsigned I = 0; I != NumElts; ++I) {
+      if (!DemandedElts[I] || UndefElts[I])
+        continue;
+      unsigned ByteShift = EltBits[I].getZExtValue() / 8;
+      unsigned Lo = I * NumBytesPerElt;
+      unsigned Hi = Lo + NumBytesPerElt;
+      // Clear mask to all zeros and insert the shifted byte indices.
+      std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
+      if (ISD::SHL == Opcode)
+        std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
+      else
+        std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
+                  Lo + ByteShift);
     }
     return true;
   }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 469ef262a05f6..12d494c32b656 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -769,22 +769,10 @@ define <16 x i8> @combine_lshr_pshufb(<4 x i32> %a0) {
 ; SSE-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,5,6,7,4,10,11],zero,xmm0[9,14,15],zero,zero
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: combine_lshr_pshufb:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,5,6,7,4,10,11],zero,xmm0[9,14,15],zero,zero
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: combine_lshr_pshufb:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: combine_lshr_pshufb:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
-; AVX512F-NEXT:    retq
+; AVX-LABEL: combine_lshr_pshufb:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,5,6,7,4,10,11],zero,xmm0[9,14,15],zero,zero
+; AVX-NEXT:    retq
   %shr = lshr <4 x i32> %a0,
   %bc = bitcast <4 x i32> %shr to <16 x i8>
   %shuffle = shufflevector <16 x i8> %bc, <16 x i8> poison, <16 x i32>
@@ -817,14 +805,12 @@ define <16 x i8> @combine_shl_pshufb(<4 x i32> %a0) {
 ;
 ; AVX2-LABEL: combine_shl_pshufb:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6],zero,zero,xmm0[8,9],zero,zero,zero,xmm0[12,13]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: combine_shl_pshufb:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6],zero,zero,xmm0[8,9],zero,zero,zero,xmm0[12,13]
 ; AVX512F-NEXT:    retq
   %shr = shl <4 x i32> %a0,
   %bc = bitcast <4 x i32> %shr to <16 x i8>
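For reference, a minimal standalone sketch of the mask construction the new code performs: for each demanded element the bytes are first cleared to zero, then `std::iota` writes the shifted byte indices, mirroring the `std::fill`/`std::iota` pair in the patch. This is not LLVM code; the sentinel values are modeled as plain negative constants and the helper name `buildShiftShuffleMask`, its parameters, and the example shift amounts are made up for illustration.

```cpp
// Standalone sketch (not LLVM code): models how the patched getFauxShuffleMask
// turns per-element 'whole byte' shift amounts into a byte shuffle mask.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

static constexpr int SM_SentinelUndef = -1; // byte is undefined
static constexpr int SM_SentinelZero = -2;  // byte is known zero

// ShiftBits[I] is the shift amount (in bits) of element I; the patch only
// accepts amounts that are a multiple of 8 and smaller than the element width.
std::vector<int> buildShiftShuffleMask(const std::vector<unsigned> &ShiftBits,
                                       unsigned NumBytesPerElt, bool IsSHL) {
  unsigned NumElts = ShiftBits.size();
  std::vector<int> Mask(NumElts * NumBytesPerElt, SM_SentinelUndef);
  for (unsigned I = 0; I != NumElts; ++I) {
    unsigned ByteShift = ShiftBits[I] / 8;
    unsigned Lo = I * NumBytesPerElt;
    unsigned Hi = Lo + NumBytesPerElt;
    // Zero the element's bytes, then write the shifted byte indices,
    // exactly as the std::fill/std::iota pair in the patch does.
    std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
    if (IsSHL)
      std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
    else
      std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
                Lo + ByteShift);
  }
  return Mask;
}

int main() {
  // Four i32 elements logically shifted right by 8, 0, 16 and 24 bits.
  std::vector<int> Mask =
      buildShiftShuffleMask({8, 0, 16, 24}, /*NumBytesPerElt=*/4,
                            /*IsSHL=*/false);
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n"); // prints: 1 2 3 -2 4 5 6 7 10 11 -2 -2 15 -2 -2 -2
  return 0;
}
```

Because every element's shifted bytes (or known-zero bytes) can be expressed directly in the shuffle mask, a per-element variable shift feeding a byte shuffle can fold away, which is what lets the vpsrlvd/vpsllvd + vpshufb pairs in the updated tests collapse into a single vpshufb.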