Skip to content

Commit b6b464d

Browse files
committed
[X86] narrowBitOpRMW - add handling for single bit insertion patterns (REAPPLIED)
Insertion of a single bit into a large integer is typically canonicalized to "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)", which can be simplified to modify the i32 block as a BTR followed by an OR((i32)InsertBit << (ShAmt % 32)). We must ensure that the InsertBit is zero apart from the LSB so we can cheaply truncate it to work with the i32 block, like the simpler BT patterns. REAPPLIED from llvm#165742, which was reverted as part of a chain of commits due to a sanitizer regression that should have been fixed by llvm#166160.
1 parent 6ad25c5 commit b6b464d

File tree

2 files changed

+114
-881
lines changed

2 files changed

+114
-881
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53349,7 +53349,8 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
5334953349
}
5335053350

5335153351
// Look for a RMW operation that only touches one bit of a larger than legal
53352-
// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value.
53352+
// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single
53353+
// i32 sub value.
5335353354
static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5335453355
SelectionDAG &DAG,
5335553356
const X86Subtarget &Subtarget) {
@@ -53375,28 +53376,42 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5337553376
// BTR: X & ~(1 << ShAmt)
5337653377
// BTS: X | (1 << ShAmt)
5337753378
// BTC: X ^ (1 << ShAmt)
53378-
SDValue ShAmt;
53379+
//
53380+
// BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
53381+
SDValue InsertBit, ShAmt;
5337953382
if (!StoredVal.hasOneUse() ||
5338053383
!(sd_match(StoredVal, m_And(m_Specific(LoadVal),
5338153384
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
5338253385
sd_match(StoredVal,
5338353386
m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
5338453387
sd_match(StoredVal,
53385-
m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt))))))
53388+
m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
53389+
sd_match(StoredVal,
53390+
m_Or(m_And(m_Specific(LoadVal),
53391+
m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
53392+
m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
5338653393
return SDValue();
5338753394

5338853395
// Ensure the shift amount is in bounds.
5338953396
KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
5339053397
if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
5339153398
return SDValue();
5339253399

53400+
// If we're inserting a bit then it must be the LSB.
53401+
if (InsertBit) {
53402+
KnownBits KnownInsert = DAG.computeKnownBits(InsertBit);
53403+
if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1))
53404+
return SDValue();
53405+
}
53406+
5339353407
// Split the shift into an alignment shift that moves the active i32 block to
5339453408
// the bottom bits for truncation and a modulo shift that can act on the i32.
5339553409
EVT AmtVT = ShAmt.getValueType();
5339653410
SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
5339753411
DAG.getSignedConstant(-32LL, DL, AmtVT));
5339853412
SDValue ModuloAmt =
5339953413
DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
53414+
ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8);
5340053415

5340153416
// Compute the byte offset for the i32 block that is changed by the RMW.
5340253417
// combineTruncate will adjust the load for us in a similar way.
@@ -53411,13 +53426,23 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5341153426
SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
5341253427
X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
5341353428

53414-
SDValue Mask =
53415-
DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
53416-
DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
53417-
if (StoredVal.getOpcode() == ISD::AND)
53418-
Mask = DAG.getNOT(DL, Mask, MVT::i32);
53429+
SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
53430+
DAG.getConstant(1, DL, MVT::i32), ModuloAmt);
53431+
53432+
SDValue Res;
53433+
if (InsertBit) {
53434+
SDValue BitMask =
53435+
DAG.getNode(ISD::SHL, DL, MVT::i32,
53436+
DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt);
53437+
Res =
53438+
DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32));
53439+
Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask);
53440+
} else {
53441+
if (StoredVal.getOpcode() == ISD::AND)
53442+
Mask = DAG.getNOT(DL, Mask, MVT::i32);
53443+
Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
53444+
}
5341953445

53420-
SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
5342153446
return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
5342253447
Align(), St->getMemOperand()->getFlags());
5342353448
}

0 commit comments

Comments
 (0)