diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 386d56dcda9de..696bb14292dd0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41629,23 +41629,28 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: + case X86ISD::VPERMV: case X86ISD::VPERMI: case X86ISD::VPERMILPI: { - if (N.getOperand(0).getValueType() == ShuffleVT && - N->isOnlyUserOf(N.getOperand(0).getNode())) { - SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); + unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0; + if (N.getOperand(SrcIdx).getValueType() == ShuffleVT && + N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) { + SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx)); unsigned SrcOpcode = N0.getOpcode(); EVT OpVT = N0.getValueType(); if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) { SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0)); SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1)); - bool FoldShuf = Opc != X86ISD::VPERMI; + bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV; if (IsMergeableWithShuffle(Op00, FoldShuf) || IsMergeableWithShuffle(Op01, FoldShuf)) { SDValue LHS, RHS; Op00 = DAG.getBitcast(ShuffleVT, Op00); Op01 = DAG.getBitcast(ShuffleVT, Op01); - if (N.getNumOperands() == 2) { + if (Opc == X86ISD::VPERMV) { + LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00); + RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01); + } else if (N.getNumOperands() == 2) { LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1)); RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1)); } else { @@ -41661,11 +41666,13 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) && OpVT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) { - SDValue Op00 = DAG.getBitcast(ShuffleVT, N0.getOperand(0)); - SDValue Res = - N.getNumOperands() == 2 - ? DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1)) - : DAG.getNode(Opc, DL, ShuffleVT, Op00); + SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0)); + if (Opc == X86ISD::VPERMV) + Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res); + else if (N.getNumOperands() == 2) + Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1)); + else + Res = DAG.getNode(Opc, DL, ShuffleVT, Res); Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res); return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res)); } diff --git a/llvm/test/CodeGen/X86/vector-partial-undef.ll b/llvm/test/CodeGen/X86/vector-partial-undef.ll index fd41fd53e3be1..4753dba2d468f 100644 --- a/llvm/test/CodeGen/X86/vector-partial-undef.ll +++ b/llvm/test/CodeGen/X86/vector-partial-undef.ll @@ -151,9 +151,9 @@ define <8 x i32> @xor_undef_elts_alt(<4 x i32> %x) { ; AVX: # %bb.0: ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [6,1,5,4,3,2,0,7] ; AVX-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: retq %extend = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> %bogus_bo = xor <8 x i32> %extend, diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 07c770abc65d6..05b0a7c10b4e1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2469,10 +2469,10 @@ define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) { ; ; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1: