@@ -39602,7 +39602,7 @@ static bool matchBinaryPermuteShuffle(
3960239602static SDValue combineX86ShuffleChainWithExtract(
3960339603 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
3960439604 ArrayRef<const SDNode *> SrcNodes, bool AllowVariableCrossLaneMask,
39605- bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39605+ bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
3960639606 const X86Subtarget &Subtarget);
3960739607
3960839608/// Combine an arbitrary chain of shuffles into a single instruction if
@@ -39619,6 +39619,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
3961939619 ArrayRef<const SDNode *> SrcNodes,
3962039620 bool AllowVariableCrossLaneMask,
3962139621 bool AllowVariablePerLaneMask,
39622+ bool IsMaskedShuffle,
3962239623 SelectionDAG &DAG,
3962339624 const X86Subtarget &Subtarget) {
3962439625 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
@@ -39666,17 +39667,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
3966639667 (RootVT.isFloatingPoint() && Depth >= 1) ||
3966739668 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
3966839669
39669- // Don't combine if we are a AVX512/EVEX target and the mask element size
39670- // is different from the root element size - this would prevent writemasks
39671- // from being reused.
39672- bool IsMaskedShuffle = false;
39673- if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39674- if (Root.hasOneUse() && Root->user_begin()->getOpcode() == ISD::VSELECT &&
39675- Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39676- IsMaskedShuffle = true;
39677- }
39678- }
39679-
3968039670 // If we are shuffling a splat (and not introducing zeros) then we can just
3968139671 // use it directly. This works for smaller elements as well as they already
3968239672 // repeat across each mask element.
@@ -40167,7 +40157,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
4016740157 // shuffle with the larger type.
4016840158 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
4016940159 Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask,
40170- AllowVariablePerLaneMask, DAG, Subtarget))
40160+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget))
4017140161 return WideShuffle;
4017240162
4017340163 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
@@ -40339,7 +40329,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
4033940329 // shuffle with the larger type.
4034040330 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
4034140331 Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask,
40342- AllowVariablePerLaneMask, DAG, Subtarget))
40332+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget))
4034340333 return WideShuffle;
4034440334
4034540335 // If we have a dual input shuffle then lower to VPERMV3,
@@ -40378,7 +40368,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
4037840368static SDValue combineX86ShuffleChainWithExtract(
4037940369 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
4038040370 ArrayRef<const SDNode *> SrcNodes, bool AllowVariableCrossLaneMask,
40381- bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40371+ bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
4038240372 const X86Subtarget &Subtarget) {
4038340373 unsigned NumMaskElts = BaseMask.size();
4038440374 unsigned NumInputs = Inputs.size();
@@ -40504,10 +40494,10 @@ static SDValue combineX86ShuffleChainWithExtract(
4050440494 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
4050540495 "WideRootSize mismatch");
4050640496
40507- if (SDValue WideShuffle =
40508- combineX86ShuffleChain( WideInputs, WideRoot, WideMask, Depth,
40509- SrcNodes, AllowVariableCrossLaneMask ,
40510- AllowVariablePerLaneMask, DAG, Subtarget)) {
40497+ if (SDValue WideShuffle = combineX86ShuffleChain(
40498+ WideInputs, WideRoot, WideMask, Depth, SrcNodes ,
40499+ AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle ,
40500+ DAG, Subtarget)) {
4051140501 WideShuffle =
4051240502 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
4051340503 return DAG.getBitcast(RootVT, WideShuffle);
@@ -41244,6 +41234,16 @@ static SDValue combineX86ShufflesRecursively(
4124441234 resolveTargetShuffleInputsAndMask(Ops, Mask);
4124541235 }
4124641236
41237+ // If we are an AVX512/EVEX target the mask element size should match the root
41238+ // element size to allow writemasks to be reused.
41239+ bool IsMaskedShuffle = false;
41240+ if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
41241+ if (Root.hasOneUse() && Root->user_begin()->getOpcode() == ISD::VSELECT &&
41242+ Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
41243+ IsMaskedShuffle = true;
41244+ }
41245+ }
41246+
4124741247 // We can only combine unary and binary shuffle mask cases.
4124841248 if (Ops.size() <= 2) {
4124941249 // Minor canonicalization of the accumulated shuffle mask to make it easier
@@ -41268,7 +41268,7 @@ static SDValue combineX86ShufflesRecursively(
4126841268 // Try to combine into a single shuffle instruction.
4126941269 if (SDValue Shuffle = combineX86ShuffleChain(
4127041270 Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask,
41271- AllowVariablePerLaneMask, DAG, Subtarget))
41271+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget))
4127241272 return Shuffle;
4127341273
4127441274 // If all the operands come from the same larger vector, fallthrough and try
@@ -41287,7 +41287,7 @@ static SDValue combineX86ShufflesRecursively(
4128741287 // shuffle with the larger type.
4128841288 return combineX86ShuffleChainWithExtract(
4128941289 Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask,
41290- AllowVariablePerLaneMask, DAG, Subtarget);
41290+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget);
4129141291}
4129241292
4129341293/// Helper entry wrapper to combineX86ShufflesRecursively.
0 commit comments