Skip to content

Commit 069a68e

Browse files
committed
[X86] Use NSW/NUW flags on ISD::TRUNCATE nodes to improve X86 PACKSS/PACKUS lowering
If the NSW/NUW flags are present on the ISD::TRUNCATE node, then we can assume the source value already fits in the destination element type, so the PACKSS/PACKUS instructions will not saturate. Fixes #87485
1 parent c938436 commit 069a68e

File tree

2 files changed

+219
-837
lines changed

2 files changed

+219
-837
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20819,7 +20819,8 @@ static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
2081920819
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
2082020820
SDValue In, const SDLoc &DL,
2082120821
SelectionDAG &DAG,
20822-
const X86Subtarget &Subtarget) {
20822+
const X86Subtarget &Subtarget,
20823+
const SDNodeFlags Flags = SDNodeFlags()) {
2082320824
// Requires SSE2.
2082420825
if (!Subtarget.hasSSE2())
2082520826
return SDValue();
@@ -20865,7 +20866,8 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
2086520866
// e.g. Masks, zext_in_reg, etc.
2086620867
// Pre-SSE41 we can only use PACKUSWB.
2086720868
KnownBits Known = DAG.computeKnownBits(In);
20868-
if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20869+
if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
20870+
(NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
2086920871
PackOpcode = X86ISD::PACKUS;
2087020872
return In;
2087120873
}
@@ -20884,7 +20886,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
2088420886
return SDValue();
2088520887

2088620888
unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20887-
if (MinSignBits < NumSignBits) {
20889+
if (Flags.hasNoSignedWrap() || MinSignBits < NumSignBits) {
2088820890
PackOpcode = X86ISD::PACKSS;
2088920891
return In;
2089020892
}
@@ -20906,10 +20908,9 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
2090620908
/// This function lowers a vector truncation of 'extended sign-bits' or
2090720909
/// 'extended zero-bits' values.
2090820910
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
20909-
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20910-
const SDLoc &DL,
20911-
const X86Subtarget &Subtarget,
20912-
SelectionDAG &DAG) {
20911+
static SDValue LowerTruncateVecPackWithSignBits(
20912+
MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
20913+
SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
2091320914
MVT SrcVT = In.getSimpleValueType();
2091420915
MVT DstSVT = DstVT.getVectorElementType();
2091520916
MVT SrcSVT = SrcVT.getVectorElementType();
@@ -20931,8 +20932,8 @@ static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
2093120932
}
2093220933

2093320934
unsigned PackOpcode;
20934-
if (SDValue Src =
20935-
matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20935+
if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
20936+
Subtarget, Flags))
2093620937
return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
2093720938

2093820939
return SDValue();
@@ -21102,8 +21103,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
2110221103
// Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
2110321104
if (!Subtarget.hasAVX512() ||
2110421105
(InVT.is512BitVector() && VT.is256BitVector()))
21105-
if (SDValue SignPack =
21106-
LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
21106+
if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21107+
VT, In, DL, Subtarget, DAG, Op->getFlags()))
2110721108
return SignPack;
2110821109

2110921110
// Pre-AVX512 see if we can make use of PACKSS/PACKUS.
@@ -21120,8 +21121,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
2112021121
// Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
2112121122
// concat from subvectors to use VPTRUNC etc.
2112221123
if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
21123-
if (SDValue SignPack =
21124-
LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
21124+
if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21125+
VT, In, DL, Subtarget, DAG, Op->getFlags()))
2112521126
return SignPack;
2112621127

2112721128
// vpmovqb/w/d, vpmovdb/w, vpmovwb
@@ -33578,10 +33579,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
3357833579

3357933580
// See if there are sufficient leading bits to perform a PACKUS/PACKSS.
3358033581
unsigned PackOpcode;
33581-
if (SDValue Src =
33582-
matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
33583-
if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
33584-
dl, DAG, Subtarget)) {
33582+
if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
33583+
Subtarget, N->getFlags())) {
33584+
if (SDValue Res =
33585+
truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
3358533586
Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
3358633587
Results.push_back(Res);
3358733588
return;

0 commit comments

Comments
 (0)