@@ -53345,7 +53345,8 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
5334553345}
5334653346
5334753347// Look for a RMW operation that only touches one bit of a larger than legal
53348- // type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value.
53348+ // type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single
53349+ // i32 sub value.
5334953350static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5335053351 SelectionDAG &DAG,
5335153352 const X86Subtarget &Subtarget) {
@@ -53371,28 +53372,42 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5337153372 // BTR: X & ~(1 << ShAmt)
5337253373 // BTS: X | (1 << ShAmt)
5337353374 // BTC: X ^ (1 << ShAmt)
53374- SDValue ShAmt;
53375+ //
53376+ // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
53377+ SDValue InsertBit, ShAmt;
5337553378 if (!StoredVal.hasOneUse() ||
5337653379 !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
5337753380 m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
5337853381 sd_match(StoredVal,
5337953382 m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
5338053383 sd_match(StoredVal,
53381- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt))))))
53384+ m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
53385+ sd_match(StoredVal,
53386+ m_Or(m_And(m_Specific(LoadVal),
53387+ m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
53388+ m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
5338253389 return SDValue();
5338353390
5338453391 // Ensure the shift amount is in bounds.
5338553392 KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
5338653393 if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
5338753394 return SDValue();
5338853395
53396+ // If we're inserting a bit then it must be the LSB.
53397+ if (InsertBit) {
53398+ KnownBits KnownInsert = DAG.computeKnownBits(InsertBit);
53399+ if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1))
53400+ return SDValue();
53401+ }
53402+
5338953403 // Split the shift into an alignment shift that moves the active i32 block to
5339053404 // the bottom bits for truncation and a modulo shift that can act on the i32.
5339153405 EVT AmtVT = ShAmt.getValueType();
5339253406 SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
5339353407 DAG.getSignedConstant(-32LL, DL, AmtVT));
5339453408 SDValue ModuloAmt =
5339553409 DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
53410+ ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8);
5339653411
5339753412 // Compute the byte offset for the i32 block that is changed by the RMW.
5339853413 // combineTruncate will adjust the load for us in a similar way.
@@ -53407,13 +53422,23 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5340753422 SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
5340853423 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
5340953424
53410- SDValue Mask =
53411- DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
53412- DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
53413- if (StoredVal.getOpcode() == ISD::AND)
53414- Mask = DAG.getNOT(DL, Mask, MVT::i32);
53425+ SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
53426+ DAG.getConstant(1, DL, MVT::i32), ModuloAmt);
53427+
53428+ SDValue Res;
53429+ if (InsertBit) {
53430+ SDValue BitMask =
53431+ DAG.getNode(ISD::SHL, DL, MVT::i32,
53432+ DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt);
53433+ Res =
53434+ DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32));
53435+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask);
53436+ } else {
53437+ if (StoredVal.getOpcode() == ISD::AND)
53438+ Mask = DAG.getNOT(DL, Mask, MVT::i32);
53439+ Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
53440+ }
5341553441
53416- SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
5341753442 return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
5341853443 Align(), St->getMemOperand()->getFlags());
5341953444}
0 commit comments