@@ -58503,14 +58503,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
       const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
       // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
-      // Only concat of subvector high halves which vperm2x128 is best at.
+      // Only concat of subvector high halves, which vperm2x128 is best at,
+      // or if it should fold into a subvector broadcast.
       if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
-          SrcVT1.is256BitVector() && SrcIdx0 == (NumSrcElts0 / 2) &&
-          SrcIdx1 == (NumSrcElts1 / 2)) {
-        return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
-                           DAG.getBitcast(VT, Src0.getOperand(0)),
-                           DAG.getBitcast(VT, Src1.getOperand(0)),
-                           DAG.getTargetConstant(0x31, DL, MVT::i8));
+          SrcVT1.is256BitVector()) {
+        assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
+               (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
+               "Bad subvector index");
+        if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
+            (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
+          unsigned Index = 0;
+          Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
+          Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
+          return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
+                             DAG.getBitcast(VT, Src0.getOperand(0)),
+                             DAG.getBitcast(VT, Src1.getOperand(0)),
+                             DAG.getTargetConstant(Index, DL, MVT::i8));
+        }
       }
       // Widen extract_subvector
       // concat(extract_subvector(x,lo), extract_subvector(x,hi))
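
For reference, the VPERM2X128 imm8 carries one 128-bit lane selector per result half: bits [1:0] pick the lane feeding the low half and bits [5:4] the lane feeding the high half, where values 0/1 address the low/high lane of the first source and 2/3 the low/high lane of the second. A minimal standalone sketch of the encoding built above (plain C++, not LLVM API code):

#include <cassert>
#include <cstdint>

// Sketch of the VPERM2X128 imm8 encoding (not LLVM API code). Each result
// half carries a 2-bit lane selector: values 0/1 address the low/high
// 128-bit lane of the first source, 2/3 the low/high lane of the second.
static uint8_t vperm2x128Imm(bool Src0Hi, bool Src1Hi) {
  uint8_t Imm = 0;
  Imm |= Src0Hi ? 0x01 : 0x00; // bits [1:0]: lane of src0 -> result low half
  Imm |= Src1Hi ? 0x30 : 0x20; // bits [5:4]: lane of src1 -> result high half
  return Imm;
}

int main() {
  assert(vperm2x128Imm(true, true) == 0x31);   // old hardcoded high/high case
  assert(vperm2x128Imm(false, false) == 0x20); // low/low, now reachable for
                                               // splatted subvector loads
  return 0;
}

Under this encoding the old hardcoded 0x31 is just the high/high case; computing the immediate from SrcIdx0/SrcIdx1 additionally covers 0x20/0x21/0x30 for the splat-of-load patterns the new guard admits.
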
@@ -58672,7 +58681,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       break;
     }
     case X86ISD::SHUFP: {
-      // TODO: Add SHUFPD support if/when necessary.
       if (!IsSplat &&
           (VT == MVT::v8f32 ||
            (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
@@ -58741,18 +58749,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
             DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
         return DAG.getBitcast(VT, Res);
       }
-      if (!IsSplat && (VT == MVT::v4f64 || VT == MVT::v8f64)) {
-        unsigned NumSubElts = Op0.getValueType().getVectorNumElements();
-        uint64_t Mask = (1ULL << NumSubElts) - 1;
-        uint64_t Idx = 0;
-        for (unsigned I = 0; I != NumOps; ++I) {
-          uint64_t SubIdx = Ops[I].getConstantOperandVal(1);
-          Idx |= (SubIdx & Mask) << (I * NumSubElts);
-        }
-        return DAG.getNode(X86ISD::VPERMILPI, DL, VT,
-                           ConcatSubOperand(VT, Ops, 0),
-                           DAG.getTargetConstant(Idx, DL, MVT::i8));
-      }
       break;
     case X86ISD::VPERMILPV:
       if (!IsSplat && (VT.is256BitVector() ||
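
For context on the block removed above: VPERMILPD consumes one selector bit per f64 element, each bit choosing between the two elements of that element's own 128-bit lane, so per-subvector immediates concatenate by plain shifting. A standalone sketch of that packing (plain C++, hypothetical helper name, not LLVM code); the per-lane fold itself appears to be superseded by the more general SHUFPD concatenation added further below:

#include <cassert>
#include <cstdint>

// Sketch of the immediate packing done by the removed VPERMILPI fold
// (hypothetical helper, not LLVM API code). VPERMILPD uses one bit per
// f64 element, selecting element 0 or 1 within that element's 128-bit
// lane, so per-subvector immediates concatenate by shifting.
static uint64_t concatPermilPDImm(const uint64_t SubImms[], unsigned NumOps,
                                  unsigned NumSubElts) {
  uint64_t Mask = (1ULL << NumSubElts) - 1;
  uint64_t Idx = 0;
  for (unsigned I = 0; I != NumOps; ++I)
    Idx |= (SubImms[I] & Mask) << (I * NumSubElts);
  return Idx;
}

int main() {
  // Two v2f64 VPERMILPD ops with imms 0b10 and 0b01 concatenate into a
  // single v4f64 imm of 0b0110.
  const uint64_t SubImms[2] = {0b10, 0b01};
  assert(concatPermilPDImm(SubImms, 2, 2) == 0b0110);
  return 0;
}
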
@@ -59323,6 +59319,59 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     return DAG.getBitcast(VT, Res);
   }
 
+  // We can always convert per-lane vXf64 shuffles into VSHUFPD.
+  if (!IsSplat &&
+      (VT == MVT::v4f64 || (VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
+      all_of(Ops, [](SDValue Op) {
+        return Op.hasOneUse() && (Op.getOpcode() == X86ISD::MOVDDUP ||
+                                  Op.getOpcode() == X86ISD::SHUFP ||
+                                  Op.getOpcode() == X86ISD::VPERMILPI ||
+                                  Op.getOpcode() == X86ISD::BLENDI ||
+                                  Op.getOpcode() == X86ISD::UNPCKL ||
+                                  Op.getOpcode() == X86ISD::UNPCKH);
+      })) {
+    // Collect the individual per-lane v2f64/v4f64 shuffles.
+    MVT OpVT = Ops[0].getSimpleValueType();
+    unsigned NumOpElts = OpVT.getVectorNumElements();
+    SmallVector<SmallVector<SDValue, 2>, 4> SrcOps(NumOps);
+    SmallVector<SmallVector<int, 8>, 4> SrcMasks(NumOps);
+    if (all_of(seq<int>(NumOps), [&](int I) {
+          return getTargetShuffleMask(Ops[I], /*AllowSentinelZero=*/false,
+                                      SrcOps[I], SrcMasks[I]) &&
+                 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
+                 SrcMasks[I].size() == NumOpElts &&
+                 all_of(SrcOps[I], [&OpVT](SDValue V) {
+                   return V.getValueType() == OpVT;
+                 });
+        })) {
+      // Concatenate the shuffle masks into a SHUFPD mask and collect subops.
+      bool Unary = true;
+      unsigned SHUFPDMask = 0;
+      SmallVector<SDValue, 4> LHS(NumOps), RHS(NumOps);
+      for (unsigned I = 0; I != NumOps; ++I) {
+        LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
+        RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
+        Unary &= LHS[I] == RHS[I];
+        for (unsigned J = 0; J != NumOpElts; ++J)
+          SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
+      }
+      // Concat SHUFPD LHS/RHS operands - if they match then it will become a
+      // PERMILPD mask and we can always profitably concatenate them.
+      SDValue Concat0 =
+          combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
+      SDValue Concat1 =
+          combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
+      if (Unary || Concat0 || Concat1) {
+        Concat0 =
+            Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
+        Concat1 =
+            Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
+        return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
+                           DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
+      }
+    }
+  }
+
   return SDValue();
 }
 
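The SHUFPD immediate assembled in the new block follows a one-bit-per-element scheme: within each 128-bit lane, the even result element is selected from the first operand and the odd result element from the second, each by the low bit of the corresponding shuffle-mask entry. A small worked sketch of the mask accumulation loop (plain C++, hypothetical mask values):

#include <cassert>

// Worked sketch of the SHUFPD immediate accumulation above (plain C++,
// hypothetical mask values). Each f64 result element J of subop I gets
// one selector bit at position I * NumOpElts + J; the bit is the low bit
// of the shuffle mask entry, i.e. whether the element comes from the odd
// or even position of its 128-bit lane.
int main() {
  const unsigned NumOps = 2, NumOpElts = 2;
  // Per-subop v2f64 masks: {1, 3} selects a[1] then b[1];
  // {0, 2} selects a[0] then b[0].
  const int SrcMasks[NumOps][NumOpElts] = {{1, 3}, {0, 2}};
  unsigned SHUFPDMask = 0;
  for (unsigned I = 0; I != NumOps; ++I)
    for (unsigned J = 0; J != NumOpElts; ++J)
      SHUFPDMask |= (SrcMasks[I][J] & 1) << (I * NumOpElts + J);
  assert(SHUFPDMask == 0b0011); // Lane 0 takes odd elements, lane 1 even.
  return 0;
}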