@@ -36665,6 +36665,43 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
36665
36665
return SDValue();
36666
36666
}
36667
36667
36668
+ /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
36669
+ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
36670
+ SelectionDAG &DAG,
36671
+ const SDLoc &DL) {
36672
+ assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
36673
+
36674
+ MVT VT = V.getSimpleValueType();
36675
+ SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
36676
+ SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
36677
+ unsigned SrcOpc0 = Src0.getOpcode();
36678
+ unsigned SrcOpc1 = Src1.getOpcode();
36679
+ EVT SrcVT0 = Src0.getValueType();
36680
+ EVT SrcVT1 = Src1.getValueType();
36681
+
36682
+ // TODO: Under what circumstances should we push perm2f128 up when we have one
36683
+ // active src?
36684
+ if (SrcOpc0 != SrcOpc1 || SrcVT0 != SrcVT1)
36685
+ return SDValue();
36686
+
36687
+ switch (SrcOpc0) {
36688
+ case X86ISD::VSHLI:
36689
+ case X86ISD::VSRLI:
36690
+ case X86ISD::VSRAI:
36691
+ if (Src0.getOperand(1) == Src1.getOperand(1)) {
36692
+ SDValue Res = DAG.getNode(
36693
+ X86ISD::VPERM2X128, DL, VT, DAG.getBitcast(VT, Src0.getOperand(0)),
36694
+ DAG.getBitcast(VT, Src1.getOperand(0)), V.getOperand(2));
36695
+ Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res),
36696
+ Src0.getOperand(1));
36697
+ return DAG.getBitcast(VT, Res);
36698
+ }
36699
+ break;
36700
+ }
36701
+
36702
+ return SDValue();
36703
+ }
36704
+
36668
36705
/// Try to combine x86 target specific shuffles.
36669
36706
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
36670
36707
TargetLowering::DAGCombinerInfo &DCI,
@@ -37045,6 +37082,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
37045
37082
return SDValue();
37046
37083
}
37047
37084
case X86ISD::VPERM2X128: {
37085
+ if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
37086
+ return Res;
37087
+
37048
37088
// If both 128-bit values were inserted into high halves of 256-bit values,
37049
37089
// the shuffle can be reduced to a concatenation of subvectors:
37050
37090
// vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
@@ -37053,6 +37093,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
37053
37093
SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
37054
37094
SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
37055
37095
unsigned Imm = N.getConstantOperandVal(2);
37096
+
37056
37097
if (!(Imm == 0x31 &&
37057
37098
Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
37058
37099
Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
0 commit comments