diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 543196adf29e4..ff42137a098d3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58465,9 +58465,13 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), NumOps * SrcVT.getVectorNumElements()); - return DAG.getNode(Op0.getOpcode(), DL, VT, - ConcatSubOperand(SrcVT, Ops, 0), - ConcatSubOperand(SrcVT, Ops, 1)); + SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0); + SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1); + if (Concat0 || Concat1) + return DAG.getNode( + Op0.getOpcode(), DL, VT, + Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0), + Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1)); } break; case X86ISD::PALIGNR: diff --git a/llvm/test/CodeGen/X86/vector-pack-512.ll b/llvm/test/CodeGen/X86/vector-pack-512.ll index dc60bfdca53b2..30e61a68bb22f 100644 --- a/llvm/test/CodeGen/X86/vector-pack-512.ll +++ b/llvm/test/CodeGen/X86/vector-pack-512.ll @@ -245,21 +245,12 @@ define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun } define <32 x i16> @concat_packsswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) { -; AVX512F-LABEL: concat_packsswd_int_2x256: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpackssdw %ymm3, %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: concat_packsswd_int_2x256: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq +; AVX512-LABEL: concat_packsswd_int_2x256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpackssdw %ymm3, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq %lo = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) %hi = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3) %res = shufflevector <16 x i16> %lo, <16 x i16> %hi, <32 x i32> @@ -268,21 +259,12 @@ define <32 x i16> @concat_packsswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) define <32 x i16> @concat_packuswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) { -; AVX512F-LABEL: concat_packuswd_int_2x256: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpackusdw %ymm3, %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: concat_packuswd_int_2x256: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq +; AVX512-LABEL: concat_packuswd_int_2x256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpackusdw %ymm3, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq %lo = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) %hi = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a2, <8 x i32> %a3) %res = shufflevector <16 x i16> %lo, <16 x i16> %hi, <32 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index 4bf8663b9ee09..f09821bb189aa 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -817,15 +817,13 @@ define <32 x i8> @concat_alignr_unnecessary(<16 x i8> %a0, <16 x i8> noundef %a1 ret <32 x i8> %res } -; TODO: Not beneficial to concatenate both inputs just to create a 256-bit packss -define <32 x i8> @concat_packsr_unnecessary(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) nounwind { -; CHECK-LABEL: concat_packsr_unnecessary: +; Not beneficial to concatenate both inputs just to create a 256-bit packss +define <32 x i8> @concat_packss_unnecessary(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) nounwind { +; CHECK-LABEL: concat_packss_unnecessary: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 +; CHECK-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %lo = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) %hi = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a2)