diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 50df19b3e6e47..1b0bf6823e390 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59323,7 +59323,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     case X86ISD::ANDNP:
       // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
       if (!IsSplat && (VT.is256BitVector() ||
-                       (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
+                       (VT.is512BitVector() && Subtarget.useAVX512Regs()) ||
+                       (EltSizeInBits == 1 && TLI.isTypeLegal(VT)))) {
         // Don't concatenate root AVX1 NOT patterns.
         // TODO: Allow NOT folding if Concat0 succeeds.
         if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
@@ -59333,7 +59334,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
           break;
         SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
         SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
-        if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
+        if (Concat0 || Concat1 ||
+            (EltSizeInBits != 1 && Subtarget.useAVX512Regs()))
           return DAG.getNode(Opcode, DL, VT,
                              Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
                              Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
@@ -59727,6 +59729,14 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
       }
     }
 
+    // Attempt to merge logic ops if the type is legal.
+    if (TLI.isTypeLegal(VT) && all_of(Ops, [](SDValue Op) {
+          return ISD::isBitwiseLogicOp(Op.getOpcode());
+        }))
+      if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops,
+                                             DAG, Subtarget))
+        return R;
+
     // Don't do anything else for i1 vectors.
     return SDValue();
   }
diff --git a/llvm/test/CodeGen/X86/kmov.ll b/llvm/test/CodeGen/X86/kmov.ll
index 8b1e69a97d545..5d216a218cf9b 100644
--- a/llvm/test/CodeGen/X86/kmov.ll
+++ b/llvm/test/CodeGen/X86/kmov.ll
@@ -477,16 +477,13 @@ define <32 x i1> @invert_i64_mask_extract_32(i64 %mask) {
 ; X64-AVX512-LABEL: invert_i64_mask_extract_32:
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    kmovq %rdi, %k0
-; X64-AVX512-NEXT:    knotb %k0, %k1
-; X64-AVX512-NEXT:    kshiftrd $8, %k0, %k2
-; X64-AVX512-NEXT:    knotb %k2, %k2
-; X64-AVX512-NEXT:    kunpckbw %k1, %k2, %k1
+; X64-AVX512-NEXT:    kshiftrd $8, %k0, %k1
+; X64-AVX512-NEXT:    kunpckbw %k0, %k1, %k1
 ; X64-AVX512-NEXT:    kshiftrd $16, %k0, %k2
-; X64-AVX512-NEXT:    knotb %k2, %k2
 ; X64-AVX512-NEXT:    kshiftrd $24, %k0, %k0
-; X64-AVX512-NEXT:    knotb %k0, %k0
 ; X64-AVX512-NEXT:    kunpckbw %k2, %k0, %k0
 ; X64-AVX512-NEXT:    kunpckwd %k1, %k0, %k0
+; X64-AVX512-NEXT:    knotd %k0, %k0
 ; X64-AVX512-NEXT:    vpmovm2b %k0, %ymm0
 ; X64-AVX512-NEXT:    retq
 ;
@@ -495,18 +492,16 @@ define <32 x i1> @invert_i64_mask_extract_32(i64 %mask) {
 ; X64-KNL-NEXT:    movl %edi, %eax
 ; X64-KNL-NEXT:    shrl $16, %eax
 ; X64-KNL-NEXT:    kmovw %eax, %k0
-; X64-KNL-NEXT:    knotw %k0, %k0
 ; X64-KNL-NEXT:    movl %edi, %eax
 ; X64-KNL-NEXT:    shrl $24, %eax
 ; X64-KNL-NEXT:    kmovw %eax, %k1
-; X64-KNL-NEXT:    knotw %k1, %k1
-; X64-KNL-NEXT:    kunpckbw %k0, %k1, %k1
+; X64-KNL-NEXT:    kunpckbw %k0, %k1, %k0
+; X64-KNL-NEXT:    knotw %k0, %k1
 ; X64-KNL-NEXT:    kmovw %edi, %k0
-; X64-KNL-NEXT:    knotw %k0, %k0
 ; X64-KNL-NEXT:    shrl $8, %edi
 ; X64-KNL-NEXT:    kmovw %edi, %k2
-; X64-KNL-NEXT:    knotw %k2, %k2
-; X64-KNL-NEXT:    kunpckbw %k0, %k2, %k2
+; X64-KNL-NEXT:    kunpckbw %k0, %k2, %k0
+; X64-KNL-NEXT:    knotw %k0, %k2
 ; X64-KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
 ; X64-KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; X64-KNL-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
@@ -586,27 +581,20 @@ define <64 x i1> @invert_i64_mask_extract_64(i64 %mask) {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    kmovq %rdi, %k0
 ; X64-AVX512-NEXT:    kshiftrq $32, %k0, %k1
-; X64-AVX512-NEXT:    knotb %k1, %k1
 ; X64-AVX512-NEXT:    kshiftrq $40, %k0, %k2
-; X64-AVX512-NEXT:    knotb %k2, %k2
 ; X64-AVX512-NEXT:    kunpckbw %k1, %k2, %k1
 ; X64-AVX512-NEXT:    kshiftrq $48, %k0, %k2
-; X64-AVX512-NEXT:    knotb %k2, %k2
 ; X64-AVX512-NEXT:    kshiftrq $56, %k0, %k3
-; X64-AVX512-NEXT:    knotb %k3, %k3
 ; X64-AVX512-NEXT:    kunpckbw %k2, %k3, %k2
 ; X64-AVX512-NEXT:    kunpckwd %k1, %k2, %k1
-; X64-AVX512-NEXT:    knotb %k0, %k2
-; X64-AVX512-NEXT:    kshiftrd $8, %k0, %k3
-; X64-AVX512-NEXT:    knotb %k3, %k3
-; X64-AVX512-NEXT:    kunpckbw %k2, %k3, %k2
+; X64-AVX512-NEXT:    kshiftrd $8, %k0, %k2
+; X64-AVX512-NEXT:    kunpckbw %k0, %k2, %k2
 ; X64-AVX512-NEXT:    kshiftrd $16, %k0, %k3
-; X64-AVX512-NEXT:    knotb %k3, %k3
 ; X64-AVX512-NEXT:    kshiftrd $24, %k0, %k0
-; X64-AVX512-NEXT:    knotb %k0, %k0
 ; X64-AVX512-NEXT:    kunpckbw %k3, %k0, %k0
 ; X64-AVX512-NEXT:    kunpckwd %k2, %k0, %k0
 ; X64-AVX512-NEXT:    kunpckdq %k0, %k1, %k0
+; X64-AVX512-NEXT:    knotq %k0, %k0
 ; X64-AVX512-NEXT:    vpmovm2b %k0, %zmm0
 ; X64-AVX512-NEXT:    retq
 ;
@@ -614,38 +602,34 @@ define <64 x i1> @invert_i64_mask_extract_64(i64 %mask) {
 ; X64-KNL:       # %bb.0:
 ; X64-KNL-NEXT:    movq %rdi, %rax
 ; X64-KNL-NEXT:    kmovw %esi, %k0
-; X64-KNL-NEXT:    knotw %k0, %k0
 ; X64-KNL-NEXT:    movl %esi, %ecx
 ; X64-KNL-NEXT:    shrl $8, %ecx
 ; X64-KNL-NEXT:    kmovw %ecx, %k1
-; X64-KNL-NEXT:    knotw %k1, %k1
 ; X64-KNL-NEXT:    kunpckbw %k0, %k1, %k0
+; X64-KNL-NEXT:    knotw %k0, %k0
 ; X64-KNL-NEXT:    movl %esi, %ecx
 ; X64-KNL-NEXT:    shrl $16, %ecx
 ; X64-KNL-NEXT:    kmovw %ecx, %k1
-; X64-KNL-NEXT:    knotw %k1, %k1
 ; X64-KNL-NEXT:    movl %esi, %ecx
 ; X64-KNL-NEXT:    shrl $24, %ecx
 ; X64-KNL-NEXT:    kmovw %ecx, %k2
-; X64-KNL-NEXT:    knotw %k2, %k2
 ; X64-KNL-NEXT:    kunpckbw %k1, %k2, %k1
+; X64-KNL-NEXT:    knotw %k1, %k1
 ; X64-KNL-NEXT:    movq %rsi, %rcx
 ; X64-KNL-NEXT:    shrq $32, %rcx
 ; X64-KNL-NEXT:    kmovw %ecx, %k2
-; X64-KNL-NEXT:    knotw %k2, %k2
 ; X64-KNL-NEXT:    movq %rsi, %rcx
 ; X64-KNL-NEXT:    shrq $40, %rcx
 ; X64-KNL-NEXT:    kmovw %ecx, %k3
-; X64-KNL-NEXT:    knotw %k3, %k3
 ; X64-KNL-NEXT:    kunpckbw %k2, %k3, %k2
+; X64-KNL-NEXT:    knotw %k2, %k2
 ; X64-KNL-NEXT:    movq %rsi, %rcx
 ; X64-KNL-NEXT:    shrq $48, %rcx
 ; X64-KNL-NEXT:    kmovw %ecx, %k3
-; X64-KNL-NEXT:    knotw %k3, %k3
 ; X64-KNL-NEXT:    shrq $56, %rsi
 ; X64-KNL-NEXT:    kmovw %esi, %k4
-; X64-KNL-NEXT:    knotw %k4, %k4
 ; X64-KNL-NEXT:    kunpckbw %k3, %k4, %k3
+; X64-KNL-NEXT:    knotw %k3, %k3
 ; X64-KNL-NEXT:    kmovw %k3, 6(%rdi)
 ; X64-KNL-NEXT:    kmovw %k2, 4(%rdi)
 ; X64-KNL-NEXT:    kmovw %k1, 2(%rdi)