Skip to content

Commit 4214ca9

Browse files
committed
[X86][AVX] Attempt to fold vpermf128(op(x,i),op(y,i)) -> op(vpermf128(x,y),i)
If vperm2f128/vperm2i128 (VPERM2X128) is acting on two similar 'inlane' ops, try to perform the lane permute first, which allows the two ops to be merged into a single op on the permuted result. This will help us fix one of the regressions in D56387.
1 parent c161775 commit 4214ca9

File tree

2 files changed

+44
-5
lines changed

2 files changed

+44
-5
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36665,6 +36665,43 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
3666536665
return SDValue();
3666636666
}
3666736667

36668+
/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
36669+
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
36670+
SelectionDAG &DAG,
36671+
const SDLoc &DL) {
36672+
assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
36673+
36674+
MVT VT = V.getSimpleValueType();
36675+
SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
36676+
SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
36677+
unsigned SrcOpc0 = Src0.getOpcode();
36678+
unsigned SrcOpc1 = Src1.getOpcode();
36679+
EVT SrcVT0 = Src0.getValueType();
36680+
EVT SrcVT1 = Src1.getValueType();
36681+
36682+
// TODO: Under what circumstances should we push perm2f128 up when we have one
36683+
// active src?
36684+
if (SrcOpc0 != SrcOpc1 || SrcVT0 != SrcVT1)
36685+
return SDValue();
36686+
36687+
switch (SrcOpc0) {
36688+
case X86ISD::VSHLI:
36689+
case X86ISD::VSRLI:
36690+
case X86ISD::VSRAI:
36691+
if (Src0.getOperand(1) == Src1.getOperand(1)) {
36692+
SDValue Res = DAG.getNode(
36693+
X86ISD::VPERM2X128, DL, VT, DAG.getBitcast(VT, Src0.getOperand(0)),
36694+
DAG.getBitcast(VT, Src1.getOperand(0)), V.getOperand(2));
36695+
Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res),
36696+
Src0.getOperand(1));
36697+
return DAG.getBitcast(VT, Res);
36698+
}
36699+
break;
36700+
}
36701+
36702+
return SDValue();
36703+
}
36704+
3666836705
/// Try to combine x86 target specific shuffles.
3666936706
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
3667036707
TargetLowering::DAGCombinerInfo &DCI,
@@ -37045,6 +37082,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
3704537082
return SDValue();
3704637083
}
3704737084
case X86ISD::VPERM2X128: {
37085+
if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
37086+
return Res;
37087+
3704837088
// If both 128-bit values were inserted into high halves of 256-bit values,
3704937089
// the shuffle can be reduced to a concatenation of subvectors:
3705037090
// vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
@@ -37053,6 +37093,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
3705337093
SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
3705437094
SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
3705537095
unsigned Imm = N.getConstantOperandVal(2);
37096+
3705637097
if (!(Imm == 0x31 &&
3705737098
Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
3705837099
Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&

llvm/test/CodeGen/X86/vector-trunc.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,11 +107,9 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
107107
;
108108
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
109109
; AVX2-SLOW: # %bb.0: # %entry
110-
; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
111-
; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
112-
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
113-
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
114-
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
110+
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
111+
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
112+
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
115113
; AVX2-SLOW-NEXT: retq
116114
;
117115
; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:

0 commit comments

Comments
 (0)