@@ -58493,14 +58493,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
       const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
       // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
-      // Only concat of subvector high halves which vperm2x128 is best at.
+      // Only concat of subvector high halves which vperm2x128 is best at or if
+      // it should fold into a subvector broadcast.
       if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
-          SrcVT1.is256BitVector() && SrcIdx0 == (NumSrcElts0 / 2) &&
-          SrcIdx1 == (NumSrcElts1 / 2)) {
-        return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
-                           DAG.getBitcast(VT, Src0.getOperand(0)),
-                           DAG.getBitcast(VT, Src1.getOperand(0)),
-                           DAG.getTargetConstant(0x31, DL, MVT::i8));
+          SrcVT1.is256BitVector()) {
+        assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
+               (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
+               "Bad subvector index");
+        if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
+            (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
+          unsigned Index = 0;
+          Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
+          Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
+          return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
+                             DAG.getBitcast(VT, Src0.getOperand(0)),
+                             DAG.getBitcast(VT, Src1.getOperand(0)),
+                             DAG.getTargetConstant(Index, DL, MVT::i8));
+        }
       }
       // Widen extract_subvector
       // concat(extract_subvector(x,lo), extract_subvector(x,hi))
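For reference (not part of the patch): in this combine the low 128-bit result lane always comes from Src0 and the high lane from Src1, and the VPERM2X128 immediate encodes which half of each source is taken, with imm[1:0] selecting the low result lane and imm[5:4] the high result lane. The sketch below, with hypothetical names, restates how the new code derives that immediate, so the previously hard-coded 0x31 ("both high halves") falls out as one case; the other encodings are now also produced when the concat is a splat of a plain load, where the commit's comment says it should fold into a subvector broadcast.

```cpp
// Standalone sketch with hypothetical names (not from the patch): derive the
// VPERM2X128 immediate from the two extract_subvector start indices.
#include <cassert>
#include <cstdint>

// SrcIdx is the extract_subvector start index: 0 selects the low 128-bit half
// of the 256-bit source, NumSrcElts / 2 selects the high half.
uint8_t getVPerm2X128Imm(unsigned SrcIdx0, unsigned NumSrcElts0,
                         unsigned SrcIdx1, unsigned NumSrcElts1) {
  assert((SrcIdx0 == 0 || SrcIdx0 == NumSrcElts0 / 2) && "Bad subvector index");
  assert((SrcIdx1 == 0 || SrcIdx1 == NumSrcElts1 / 2) && "Bad subvector index");
  uint8_t Imm = 0;
  Imm |= SrcIdx0 == 0 ? 0x00 : 0x01; // imm[1:0]: result lo lane <- src0 lo/hi half
  Imm |= SrcIdx1 == 0 ? 0x20 : 0x30; // imm[5:4]: result hi lane <- src1 lo/hi half
  return Imm; // 0x31 reproduces the old "both high halves" case.
}
```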
@@ -59313,6 +59322,45 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     return DAG.getBitcast(VT, Res);
   }
 
+  // We can always convert per-lane vXf64 shuffles into VSHUFPD.
+  if (!IsSplat && NumOps == 2 && VT == MVT::v4f64 &&
+      all_of(Ops, [](SDValue Op) {
+        return Op.hasOneUse() && (Op.getOpcode() == X86ISD::MOVDDUP ||
+                                  Op.getOpcode() == X86ISD::SHUFP ||
+                                  Op.getOpcode() == X86ISD::VPERMILPI ||
+                                  Op.getOpcode() == X86ISD::BLENDI ||
+                                  Op.getOpcode() == X86ISD::UNPCKL ||
+                                  Op.getOpcode() == X86ISD::UNPCKH);
+      })) {
+    SmallVector<SDValue, 2> SrcOps0, SrcOps1;
+    SmallVector<int, 8> SrcMask0, SrcMask1;
+    if (getTargetShuffleMask(Ops[0], /*AllowSentinelZero=*/false, SrcOps0,
+                             SrcMask0) &&
+        getTargetShuffleMask(Ops[1], /*AllowSentinelZero=*/false, SrcOps1,
+                             SrcMask1)) {
+      assert(SrcMask0.size() == 2 && SrcMask1.size() == 2 && "Bad shuffles");
+      SDValue LHS[] = {SrcOps0[SrcMask0[0] / 2], SrcOps1[SrcMask1[0] / 2]};
+      SDValue RHS[] = {SrcOps0[SrcMask0[1] / 2], SrcOps1[SrcMask1[1] / 2]};
+      SDValue Concat0 =
+          combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
+      SDValue Concat1 =
+          combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
+      if (Concat0 || Concat1) {
+        unsigned SHUFPDMask = 0;
+        SHUFPDMask |= (SrcMask0[0] & 1) << 0;
+        SHUFPDMask |= (SrcMask0[1] & 1) << 1;
+        SHUFPDMask |= (SrcMask1[0] & 1) << 2;
+        SHUFPDMask |= (SrcMask1[1] & 1) << 3;
+        Concat0 =
+            Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
+        Concat1 =
+            Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
+        return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
+                           DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
+      }
+    }
+  }
+
   return SDValue();
 }
 
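A note on the SHUFPD mask construction above (illustrative, not from the patch): for a v4f64 VSHUFPD, result elements 0 and 2 come from the first operand and elements 1 and 3 from the second, each taken from the matching 128-bit lane, and bit i of the immediate picks the even (0) or odd (1) f64 within that lane. Each per-lane 2-element shuffle mask entry encodes the source operand in entry / 2 and the even/odd element in entry & 1, so the combine routes the operand choices into the LHS/RHS concats and the low bits into the 4-bit immediate, as the hypothetical helper below restates.

```cpp
// Hypothetical helper (not in the patch): rebuild the 4-bit VSHUFPD immediate
// from the two per-128-bit-lane v2f64 shuffle masks gathered above.
#include <cstdint>

// Mask0/Mask1 are the 2-element masks of the shuffles feeding the low and
// high lanes of the v4f64 result; each entry is in [0, 3], where entry / 2
// picks the source operand and entry & 1 picks the even or odd f64.
uint8_t getShufpdImm(const int Mask0[2], const int Mask1[2]) {
  uint8_t Imm = 0;
  Imm |= (Mask0[0] & 1) << 0; // result elt 0: even/odd f64 of LHS concat, lane 0
  Imm |= (Mask0[1] & 1) << 1; // result elt 1: even/odd f64 of RHS concat, lane 0
  Imm |= (Mask1[0] & 1) << 2; // result elt 2: even/odd f64 of LHS concat, lane 1
  Imm |= (Mask1[1] & 1) << 3; // result elt 3: even/odd f64 of RHS concat, lane 1
  return Imm;
}
```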