@@ -39651,13 +39651,6 @@ static bool matchBinaryPermuteShuffle(
3965139651 return false;
3965239652}
3965339653
39654- static SDValue combineX86ShuffleChainWithExtract(
39655- ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39656- ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39657- bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39658- bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39659- const X86Subtarget &Subtarget);
39660-
3966139654/// Combine an arbitrary chain of shuffles into a single instruction if
3966239655/// possible.
3966339656///
@@ -40201,14 +40194,6 @@ static SDValue combineX86ShuffleChain(
4020140194 return DAG.getBitcast(RootVT, Res);
4020240195 }
4020340196
40204- // If that failed and either input is extracted then try to combine as a
40205- // shuffle with the larger type.
40206- if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40207- Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40208- AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40209- IsMaskedShuffle, DAG, DL, Subtarget))
40210- return WideShuffle;
40211-
4021240197 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
4021340198 // (non-VLX will pad to 512-bit shuffles).
4021440199 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
@@ -40374,14 +40359,6 @@ static SDValue combineX86ShuffleChain(
4037440359 return DAG.getBitcast(RootVT, Res);
4037540360 }
4037640361
40377- // If that failed and either input is extracted then try to combine as a
40378- // shuffle with the larger type.
40379- if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40380- Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40381- AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40382- DAG, DL, Subtarget))
40383- return WideShuffle;
40384-
4038540362 // If we have a dual input shuffle then lower to VPERMV3,
4038640363 // (non-VLX will pad to 512-bit shuffles)
4038740364 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
@@ -40407,148 +40384,6 @@ static SDValue combineX86ShuffleChain(
4040740384 return SDValue();
4040840385}
4040940386
40410- // Combine an arbitrary chain of shuffles + extract_subvectors into a single
40411- // instruction if possible.
40412- //
40413- // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40414- // type size to attempt to combine:
40415- // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40416- // -->
40417- // extract_subvector(shuffle(x,y,m2),0)
40418- static SDValue combineX86ShuffleChainWithExtract(
40419- ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40420- ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40421- bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40422- bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40423- const X86Subtarget &Subtarget) {
40424- unsigned NumMaskElts = BaseMask.size();
40425- unsigned NumInputs = Inputs.size();
40426- if (NumInputs == 0)
40427- return SDValue();
40428-
40429- unsigned RootSizeInBits = RootVT.getSizeInBits();
40430- unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40431- assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40432-
40433- // Peek through subvectors to find widest legal vector.
40434- // TODO: Handle ISD::TRUNCATE
40435- unsigned WideSizeInBits = RootSizeInBits;
40436- for (SDValue Input : Inputs) {
40437- Input = peekThroughBitcasts(Input);
40438- while (1) {
40439- if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40440- Input = peekThroughBitcasts(Input.getOperand(0));
40441- continue;
40442- }
40443- if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40444- Input.getOperand(0).isUndef()) {
40445- Input = peekThroughBitcasts(Input.getOperand(1));
40446- continue;
40447- }
40448- break;
40449- }
40450- if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40451- WideSizeInBits < Input.getValueSizeInBits())
40452- WideSizeInBits = Input.getValueSizeInBits();
40453- }
40454-
40455- // Bail if we fail to find a source larger than the existing root.
40456- if (WideSizeInBits <= RootSizeInBits ||
40457- (WideSizeInBits % RootSizeInBits) != 0)
40458- return SDValue();
40459-
40460- // Create new mask for larger type.
40461- SmallVector<int, 64> WideMask;
40462- growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40463-
40464- // Attempt to peek through inputs and adjust mask when we extract from an
40465- // upper subvector.
40466- int AdjustedMasks = 0;
40467- SmallVector<SDValue, 4> WideInputs(Inputs);
40468- for (unsigned I = 0; I != NumInputs; ++I) {
40469- SDValue &Input = WideInputs[I];
40470- Input = peekThroughBitcasts(Input);
40471- while (1) {
40472- if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40473- Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40474- uint64_t Idx = Input.getConstantOperandVal(1);
40475- if (Idx != 0) {
40476- ++AdjustedMasks;
40477- unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40478- Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40479-
40480- int lo = I * WideMask.size();
40481- int hi = (I + 1) * WideMask.size();
40482- for (int &M : WideMask)
40483- if (lo <= M && M < hi)
40484- M += Idx;
40485- }
40486- Input = peekThroughBitcasts(Input.getOperand(0));
40487- continue;
40488- }
40489- // TODO: Handle insertions into upper subvectors.
40490- if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40491- Input.getOperand(0).isUndef() &&
40492- isNullConstant(Input.getOperand(2))) {
40493- Input = peekThroughBitcasts(Input.getOperand(1));
40494- continue;
40495- }
40496- break;
40497- }
40498- }
40499-
40500- // Remove unused/repeated shuffle source ops.
40501- resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40502- assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40503-
40504- // Bail if we're always extracting from the lowest subvectors,
40505- // combineX86ShuffleChain should match this for the current width, or the
40506- // shuffle still references too many inputs.
40507- if (AdjustedMasks == 0 || WideInputs.size() > 2)
40508- return SDValue();
40509-
40510- // Minor canonicalization of the accumulated shuffle mask to make it easier
40511- // to match below. All this does is detect masks with sequential pairs of
40512- // elements, and shrink them to the half-width mask. It does this in a loop
40513- // so it will reduce the size of the mask to the minimal width mask which
40514- // performs an equivalent shuffle.
40515- while (WideMask.size() > 1) {
40516- SmallVector<int, 64> WidenedMask;
40517- if (!canWidenShuffleElements(WideMask, WidenedMask))
40518- break;
40519- WideMask = std::move(WidenedMask);
40520- }
40521-
40522- // Canonicalization of binary shuffle masks to improve pattern matching by
40523- // commuting the inputs.
40524- if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40525- ShuffleVectorSDNode::commuteMask(WideMask);
40526- std::swap(WideInputs[0], WideInputs[1]);
40527- }
40528-
40529- // Increase depth for every upper subvector we've peeked through.
40530- Depth += AdjustedMasks;
40531-
40532- // Attempt to combine wider chain.
40533- // TODO: Can we use a better Root?
40534- SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40535- WideInputs.back().getValueSizeInBits()
40536- ? WideInputs.front()
40537- : WideInputs.back();
40538- assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40539- "WideRootSize mismatch");
40540-
40541- if (SDValue WideShuffle = combineX86ShuffleChain(
40542- WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40543- Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40544- IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40545- WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40546- return DAG.getBitcast(RootVT, WideShuffle);
40547- }
40548-
40549- return SDValue();
40550- }
40551-
4055240387// Canonicalize the combined shuffle mask chain with horizontal ops.
4055340388// NOTE: This may update the Ops and Mask.
4055440389static SDValue canonicalizeShuffleMaskWithHorizOp(
@@ -40961,6 +40796,54 @@ static SDValue combineX86ShufflesRecursively(
4096140796 OpMask.assign(NumElts, SM_SentinelUndef);
4096240797 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
4096340798 OpZero = OpUndef = APInt::getZero(NumElts);
40799+ } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40800+ TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
40801+ Op.getOperand(0).getValueSizeInBits() > RootSizeInBits &&
40802+ (Op.getOperand(0).getValueSizeInBits() % RootSizeInBits) == 0) {
40803+ // Extracting from vector larger than RootVT - scale the mask and attempt to
40804+ // fold the shuffle with the larger root type, then extract the lower
40805+ // elements.
40806+ unsigned NewRootSizeInBits = Op.getOperand(0).getValueSizeInBits();
40807+ unsigned Scale = NewRootSizeInBits / RootSizeInBits;
40808+ MVT NewRootVT = MVT::getVectorVT(RootVT.getScalarType(),
40809+ Scale * RootVT.getVectorNumElements());
40810+ SmallVector<int, 64> NewRootMask;
40811+ growShuffleMask(RootMask, NewRootMask, RootSizeInBits, NewRootSizeInBits);
40812+ // If we're using the lowest subvector, just replace it directly in the src
40813+ // ops/nodes.
40814+ SmallVector<SDValue, 16> NewSrcOps(SrcOps);
40815+ SmallVector<const SDNode *, 16> NewSrcNodes(SrcNodes);
40816+ if (isNullConstant(Op.getOperand(1))) {
40817+ NewSrcOps[SrcOpIndex] = Op.getOperand(0);
40818+ NewSrcNodes.push_back(Op.getNode());
40819+ }
40820+ // Don't increase the combine depth - we're effectively working on the same
40821+ // nodes, just with a wider type.
40822+ if (SDValue WideShuffle = combineX86ShufflesRecursively(
40823+ NewSrcOps, SrcOpIndex, RootOpc, NewRootVT, NewRootMask, NewSrcNodes,
40824+ Depth, MaxDepth, AllowVariableCrossLaneMask,
40825+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, DL, Subtarget))
40826+ return DAG.getBitcast(
40827+ RootVT, extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits));
40828+ return SDValue();
40829+ } else if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
40830+ Op.getOperand(1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40831+ Op.getOperand(1).getOperand(0).getValueSizeInBits() >
40832+ RootSizeInBits) {
40832+ // If we're inserting a subvector extracted from a vector larger than
40833+ // RootVT, then combine the insert_subvector as a shuffle; the
40835+ // extract_subvector will be folded in a later recursion.
40836+ SDValue BaseVec = Op.getOperand(0);
40837+ SDValue SubVec = Op.getOperand(1);
40838+ int InsertIdx = Op.getConstantOperandVal(2);
40839+ unsigned NumBaseElts = VT.getVectorNumElements();
40840+ unsigned NumSubElts = SubVec.getValueType().getVectorNumElements();
40841+ OpInputs.assign({BaseVec, SubVec});
40842+ OpMask.resize(NumBaseElts);
40843+ std::iota(OpMask.begin(), OpMask.end(), 0);
40844+ std::iota(OpMask.begin() + InsertIdx,
40845+ OpMask.begin() + InsertIdx + NumSubElts, NumBaseElts);
40846+ OpZero = OpUndef = APInt::getZero(NumBaseElts);
4096440847 } else {
4096540848 return SDValue();
4096640849 }
@@ -41303,25 +41186,9 @@ static SDValue combineX86ShufflesRecursively(
4130341186 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
4130441187 IsMaskedShuffle, DAG, DL, Subtarget))
4130541188 return Shuffle;
41306-
41307- // If all the operands come from the same larger vector, fallthrough and try
41308- // to use combineX86ShuffleChainWithExtract.
41309- SDValue LHS = peekThroughBitcasts(Ops.front());
41310- SDValue RHS = peekThroughBitcasts(Ops.back());
41311- if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41312- (RootSizeInBits / Mask.size()) != 64 ||
41313- LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41314- RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41315- LHS.getOperand(0) != RHS.getOperand(0))
41316- return SDValue();
4131741189 }
4131841190
41319- // If that failed and any input is extracted then try to combine as a
41320- // shuffle with the larger type.
41321- return combineX86ShuffleChainWithExtract(
41322- Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41323- AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41324- DAG, DL, Subtarget);
41191+ return SDValue();
4132541192}
4132641193
4132741194/// Helper entry wrapper to combineX86ShufflesRecursively.
@@ -43921,6 +43788,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
4392143788 case X86ISD::UNPCKL:
4392243789 case X86ISD::UNPCKH:
4392343790 case X86ISD::BLENDI:
43791+ case X86ISD::SHUFP:
4392443792 // Integer ops.
4392543793 case X86ISD::PACKSS:
4392643794 case X86ISD::PACKUS:
0 commit comments