@@ -39646,13 +39646,6 @@ static bool matchBinaryPermuteShuffle(
3964639646 return false;
3964739647}
3964839648
39649- static SDValue combineX86ShuffleChainWithExtract(
39650- ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39651- ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39652- bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39653- bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39654- const X86Subtarget &Subtarget);
39655-
3965639649/// Combine an arbitrary chain of shuffles into a single instruction if
3965739650/// possible.
3965839651///
@@ -40197,14 +40190,6 @@ static SDValue combineX86ShuffleChain(
4019740190 return DAG.getBitcast(RootVT, Res);
4019840191 }
4019940192
40200- // If that failed and either input is extracted then try to combine as a
40201- // shuffle with the larger type.
40202- if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40203- Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40204- AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40205- IsMaskedShuffle, DAG, DL, Subtarget))
40206- return WideShuffle;
40207-
4020840193 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
4020940194 // (non-VLX will pad to 512-bit shuffles).
4021040195 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
@@ -40370,14 +40355,6 @@ static SDValue combineX86ShuffleChain(
4037040355 return DAG.getBitcast(RootVT, Res);
4037140356 }
4037240357
40373- // If that failed and either input is extracted then try to combine as a
40374- // shuffle with the larger type.
40375- if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40376- Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40377- AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40378- DAG, DL, Subtarget))
40379- return WideShuffle;
40380-
4038140358 // If we have a dual input shuffle then lower to VPERMV3,
4038240359 // (non-VLX will pad to 512-bit shuffles)
4038340360 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
@@ -40403,148 +40380,6 @@ static SDValue combineX86ShuffleChain(
4040340380 return SDValue();
4040440381}
4040540382
40406- // Combine an arbitrary chain of shuffles + extract_subvectors into a single
40407- // instruction if possible.
40408- //
40409- // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40410- // type size to attempt to combine:
40411- // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40412- // -->
40413- // extract_subvector(shuffle(x,y,m2),0)
40414- static SDValue combineX86ShuffleChainWithExtract(
40415- ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40416- ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40417- bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40418- bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40419- const X86Subtarget &Subtarget) {
40420- unsigned NumMaskElts = BaseMask.size();
40421- unsigned NumInputs = Inputs.size();
40422- if (NumInputs == 0)
40423- return SDValue();
40424-
40425- unsigned RootSizeInBits = RootVT.getSizeInBits();
40426- unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40427- assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40428-
40429- // Peek through subvectors to find widest legal vector.
40430- // TODO: Handle ISD::TRUNCATE
40431- unsigned WideSizeInBits = RootSizeInBits;
40432- for (SDValue Input : Inputs) {
40433- Input = peekThroughBitcasts(Input);
40434- while (1) {
40435- if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40436- Input = peekThroughBitcasts(Input.getOperand(0));
40437- continue;
40438- }
40439- if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40440- Input.getOperand(0).isUndef()) {
40441- Input = peekThroughBitcasts(Input.getOperand(1));
40442- continue;
40443- }
40444- break;
40445- }
40446- if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40447- WideSizeInBits < Input.getValueSizeInBits())
40448- WideSizeInBits = Input.getValueSizeInBits();
40449- }
40450-
40451- // Bail if we fail to find a source larger than the existing root.
40452- if (WideSizeInBits <= RootSizeInBits ||
40453- (WideSizeInBits % RootSizeInBits) != 0)
40454- return SDValue();
40455-
40456- // Create new mask for larger type.
40457- SmallVector<int, 64> WideMask;
40458- growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40459-
40460- // Attempt to peek through inputs and adjust mask when we extract from an
40461- // upper subvector.
40462- int AdjustedMasks = 0;
40463- SmallVector<SDValue, 4> WideInputs(Inputs);
40464- for (unsigned I = 0; I != NumInputs; ++I) {
40465- SDValue &Input = WideInputs[I];
40466- Input = peekThroughBitcasts(Input);
40467- while (1) {
40468- if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40469- Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40470- uint64_t Idx = Input.getConstantOperandVal(1);
40471- if (Idx != 0) {
40472- ++AdjustedMasks;
40473- unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40474- Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40475-
40476- int lo = I * WideMask.size();
40477- int hi = (I + 1) * WideMask.size();
40478- for (int &M : WideMask)
40479- if (lo <= M && M < hi)
40480- M += Idx;
40481- }
40482- Input = peekThroughBitcasts(Input.getOperand(0));
40483- continue;
40484- }
40485- // TODO: Handle insertions into upper subvectors.
40486- if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40487- Input.getOperand(0).isUndef() &&
40488- isNullConstant(Input.getOperand(2))) {
40489- Input = peekThroughBitcasts(Input.getOperand(1));
40490- continue;
40491- }
40492- break;
40493- }
40494- }
40495-
40496- // Remove unused/repeated shuffle source ops.
40497- resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40498- assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40499-
40500- // Bail if we're always extracting from the lowest subvectors,
40501- // combineX86ShuffleChain should match this for the current width, or the
40502- // shuffle still references too many inputs.
40503- if (AdjustedMasks == 0 || WideInputs.size() > 2)
40504- return SDValue();
40505-
40506- // Minor canonicalization of the accumulated shuffle mask to make it easier
40507- // to match below. All this does is detect masks with sequential pairs of
40508- // elements, and shrink them to the half-width mask. It does this in a loop
40509- // so it will reduce the size of the mask to the minimal width mask which
40510- // performs an equivalent shuffle.
40511- while (WideMask.size() > 1) {
40512- SmallVector<int, 64> WidenedMask;
40513- if (!canWidenShuffleElements(WideMask, WidenedMask))
40514- break;
40515- WideMask = std::move(WidenedMask);
40516- }
40517-
40518- // Canonicalization of binary shuffle masks to improve pattern matching by
40519- // commuting the inputs.
40520- if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40521- ShuffleVectorSDNode::commuteMask(WideMask);
40522- std::swap(WideInputs[0], WideInputs[1]);
40523- }
40524-
40525- // Increase depth for every upper subvector we've peeked through.
40526- Depth += AdjustedMasks;
40527-
40528- // Attempt to combine wider chain.
40529- // TODO: Can we use a better Root?
40530- SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40531- WideInputs.back().getValueSizeInBits()
40532- ? WideInputs.front()
40533- : WideInputs.back();
40534- assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40535- "WideRootSize mismatch");
40536-
40537- if (SDValue WideShuffle = combineX86ShuffleChain(
40538- WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40539- Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40540- IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40541- WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40542- return DAG.getBitcast(RootVT, WideShuffle);
40543- }
40544-
40545- return SDValue();
40546- }
40547-
4054840383// Canonicalize the combined shuffle mask chain with horizontal ops.
4054940384// NOTE: This may update the Ops and Mask.
4055040385static SDValue canonicalizeShuffleMaskWithHorizOp(
@@ -40957,6 +40792,54 @@ static SDValue combineX86ShufflesRecursively(
4095740792 OpMask.assign(NumElts, SM_SentinelUndef);
4095840793 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
4095940794 OpZero = OpUndef = APInt::getZero(NumElts);
40795+ } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40796+ TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
40797+ Op.getOperand(0).getValueSizeInBits() > RootSizeInBits &&
40798+ (Op.getOperand(0).getValueSizeInBits() % RootSizeInBits) == 0) {
40799+ // Extracting from vector larger than RootVT - scale the mask and attempt to
40800+ // fold the shuffle with the larger root type, then extract the lower
40801+ // elements.
40802+ unsigned NewRootSizeInBits = Op.getOperand(0).getValueSizeInBits();
40803+ unsigned Scale = NewRootSizeInBits / RootSizeInBits;
40804+ MVT NewRootVT = MVT::getVectorVT(RootVT.getScalarType(),
40805+ Scale * RootVT.getVectorNumElements());
40806+ SmallVector<int, 64> NewRootMask;
40807+ growShuffleMask(RootMask, NewRootMask, RootSizeInBits, NewRootSizeInBits);
40808+ // If we're using the lowest subvector, just replace it directly in the src
40809+ // ops/nodes.
40810+ SmallVector<SDValue, 16> NewSrcOps(SrcOps);
40811+ SmallVector<const SDNode *, 16> NewSrcNodes(SrcNodes);
40812+ if (isNullConstant(Op.getOperand(1))) {
40813+ NewSrcOps[SrcOpIndex] = Op.getOperand(0);
40814+ NewSrcNodes.push_back(Op.getNode());
40815+ }
40816+ // Don't increase the combine depth - we're effectively working on the same
40817+ // nodes, just with a wider type.
40818+ if (SDValue WideShuffle = combineX86ShufflesRecursively(
40819+ NewSrcOps, SrcOpIndex, RootOpc, NewRootVT, NewRootMask, NewSrcNodes,
40820+ Depth, MaxDepth, AllowVariableCrossLaneMask,
40821+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, DL, Subtarget))
40822+ return DAG.getBitcast(
40823+ RootVT, extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits));
40824+ return SDValue();
40825+ } else if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
40826+ Op.getOperand(1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40827+ Op.getOperand(1).getOperand(0).getValueSizeInBits() >
40828+ RootSizeInBits) {
40829+ // If we're inserting a subvector extracted from a vector larger than
40830+ // RootVT, then combine the insert_subvector as a shuffle, the
40831+ // extract_subvector will be folded in a later recursion.
40832+ SDValue BaseVec = Op.getOperand(0);
40833+ SDValue SubVec = Op.getOperand(1);
40834+ int InsertIdx = Op.getConstantOperandVal(2);
40835+ unsigned NumBaseElts = VT.getVectorNumElements();
40836+ unsigned NumSubElts = SubVec.getValueType().getVectorNumElements();
40837+ OpInputs.assign({BaseVec, SubVec});
40838+ OpMask.resize(NumBaseElts);
40839+ std::iota(OpMask.begin(), OpMask.end(), 0);
40840+ std::iota(OpMask.begin() + InsertIdx,
40841+ OpMask.begin() + InsertIdx + NumSubElts, NumBaseElts);
40842+ OpZero = OpUndef = APInt::getZero(NumBaseElts);
4096040843 } else {
4096140844 return SDValue();
4096240845 }
@@ -41303,25 +41186,9 @@ static SDValue combineX86ShufflesRecursively(
4130341186 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
4130441187 IsMaskedShuffle, DAG, DL, Subtarget))
4130541188 return Shuffle;
41306-
41307- // If all the operands come from the same larger vector, fallthrough and try
41308- // to use combineX86ShuffleChainWithExtract.
41309- SDValue LHS = peekThroughBitcasts(Ops.front());
41310- SDValue RHS = peekThroughBitcasts(Ops.back());
41311- if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41312- (RootSizeInBits / Mask.size()) != 64 ||
41313- LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41314- RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41315- LHS.getOperand(0) != RHS.getOperand(0))
41316- return SDValue();
4131741189 }
4131841190
41319- // If that failed and any input is extracted then try to combine as a
41320- // shuffle with the larger type.
41321- return combineX86ShuffleChainWithExtract(
41322- Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41323- AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41324- DAG, DL, Subtarget);
41191+ return SDValue();
4132541192}
4132641193
4132741194/// Helper entry wrapper to combineX86ShufflesRecursively.
@@ -43924,6 +43791,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
4392443791 case X86ISD::UNPCKL:
4392543792 case X86ISD::UNPCKH:
4392643793 case X86ISD::BLENDI:
43794+ case X86ISD::SHUFP:
4392743795 // Integer ops.
4392843796 case X86ISD::PACKSS:
4392943797 case X86ISD::PACKUS:
0 commit comments