@@ -5654,6 +5654,18 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
56545654 MachineMemOperand::MOVolatile;
56555655 return true;
56565656 }
5657+ case Intrinsic::x86_atomic_bts_rm:
5658+ case Intrinsic::x86_atomic_btc_rm:
5659+ case Intrinsic::x86_atomic_btr_rm: {
5660+ Info.opc = ISD::INTRINSIC_W_CHAIN;
5661+ Info.ptrVal = I.getArgOperand(0);
5662+ unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5663+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5664+ Info.align = Align(Size);
5665+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5666+ MachineMemOperand::MOVolatile;
5667+ return true;
5668+ }
56575669 case Intrinsic::x86_aadd32:
56585670 case Intrinsic::x86_aadd64:
56595671 case Intrinsic::x86_aand32:
@@ -28364,6 +28376,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
2836428376 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
2836528377 Operation.getValue(1));
2836628378 }
28379+ case Intrinsic::x86_atomic_bts_rm:
28380+ case Intrinsic::x86_atomic_btc_rm:
28381+ case Intrinsic::x86_atomic_btr_rm: {
28382+ SDLoc DL(Op);
28383+ MVT VT = Op.getSimpleValueType();
28384+ SDValue Chain = Op.getOperand(0);
28385+ SDValue Op1 = Op.getOperand(2);
28386+ SDValue Op2 = Op.getOperand(3);
28387+ unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28388+ : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28389+ : X86ISD::LBTR_RM;
28390+ MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28391+ SDValue Res =
28392+ DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28393+ {Chain, Op1, Op2}, VT, MMO);
28394+ Chain = Res.getValue(1);
28395+ Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28396+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28397+ }
2836728398 case Intrinsic::x86_atomic_bts:
2836828399 case Intrinsic::x86_atomic_btc:
2836928400 case Intrinsic::x86_atomic_btr: {
@@ -31401,6 +31432,75 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
3140131432 : AtomicExpansionKind::None;
3140231433}
3140331434
/// Classification of a value with respect to a single-bit mask, as produced
/// by FindSingleBitChange().
enum BitTestKind : unsigned {
  /// The value was not recognized as any single-bit pattern.
  UndefBit = 0,
  /// A constant integer that is a power of two.
  ConstantBit = 1,
  /// A constant integer whose bitwise complement is a power of two.
  NotConstantBit = 2,
  /// A `1 << X` shift with a non-constant result.
  ShiftBit = 3,
  /// The bitwise NOT of a `1 << X` shift.
  NotShiftBit = 4
};
31442+
31443+ static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31444+ using namespace llvm::PatternMatch;
31445+ BitTestKind BTK = UndefBit;
31446+ auto *C = dyn_cast<ConstantInt>(V);
31447+ if (C) {
31448+ // Check if V is a power of 2 or NOT power of 2.
31449+ if (isPowerOf2_64(C->getZExtValue()))
31450+ BTK = ConstantBit;
31451+ else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31452+ BTK = NotConstantBit;
31453+ return {V, BTK};
31454+ }
31455+
31456+ // Check if V is some power of 2 pattern known to be non-zero
31457+ auto *I = dyn_cast<Instruction>(V);
31458+ if (I) {
31459+ bool Not = false;
31460+ // Check if we have a NOT
31461+ Value *PeekI;
31462+ if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
31463+ match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31464+ Not = true;
31465+ I = dyn_cast<Instruction>(PeekI);
31466+ assert(I != nullptr);
31467+ }
31468+ // We can only use 1 << X without more sophisticated analysis. C << X where
31469+ // C is a power of 2 but not 1 can result in zero which cannot be translated
31470+ // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31471+ if (I->getOpcode() == Instruction::Shl) {
31472+ // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31473+ // -X` and some other provable power of 2 patterns that we can use CTZ on
31474+ // may be profitable.
31475+ // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31476+ // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31477+ // be provably a non-zero power of 2.
31478+ // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31479+ // transformable to bittest.
31480+ auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31481+ if (!ShiftVal)
31482+ return {nullptr, UndefBit};
31483+ if (ShiftVal->equalsInt(1))
31484+ BTK = Not ? NotShiftBit : ShiftBit;
31485+
31486+ if (BTK == UndefBit)
31487+ return {nullptr, UndefBit};
31488+
31489+ Value *BitV = I->getOperand(1);
31490+
31491+ Value *AndOp;
31492+ const APInt *AndC;
31493+ if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
31494+ // Read past a shiftmask instruction to find count
31495+ if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
31496+ BitV = AndOp;
31497+ }
31498+ return {BitV, BTK};
31499+ }
31500+ }
31501+ return {nullptr, UndefBit};
31502+ }
31503+
3140431504TargetLowering::AtomicExpansionKind
3140531505X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
3140631506 // If the atomicrmw's result isn't actually used, we can just add a "lock"
@@ -31410,51 +31510,138 @@ X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
3141031510
3141131511 // If the atomicrmw's result is used by a single bit AND, we may use
3141231512 // bts/btr/btc instruction for these operations.
31413- auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
31513+ // Note: InstCombinePass can cause a de-optimization here. It replaces the
31514+ // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31515+ // (depending on CC). This pattern can only use bts/btr/btc but we don't
31516+ // detect it.
3141431517 Instruction *I = AI->user_back();
31415- if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And ||
31518+ auto BitChange = FindSingleBitChange(AI->getValOperand());
31519+ if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31520+ I->getOpcode() != Instruction::And ||
31521+ AI->getType()->getPrimitiveSizeInBits() == 8 ||
3141631522 AI->getParent() != I->getParent())
3141731523 return AtomicExpansionKind::CmpXChg;
31524+
31525+ assert(I->getOperand(0) == AI);
3141831526 // The following instruction must be a AND single bit.
31419- auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
31420- unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
31421- if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue()))
31527+ if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31528+ auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
31529+ assert(C1 != nullptr);
31530+ auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
31531+ if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31532+ return AtomicExpansionKind::CmpXChg;
31533+ }
31534+ if (AI->getOperation() == AtomicRMWInst::And) {
31535+ return ~C1->getValue() == C2->getValue()
31536+ ? AtomicExpansionKind::BitTestIntrinsic
31537+ : AtomicExpansionKind::CmpXChg;
31538+ }
31539+ return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31540+ : AtomicExpansionKind::CmpXChg;
31541+ }
31542+
31543+ assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31544+
31545+ auto BitTested = FindSingleBitChange(I->getOperand(1));
31546+ if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31547+ return AtomicExpansionKind::CmpXChg;
31548+
31549+ assert(BitChange.first != nullptr && BitTested.first != nullptr);
31550+
31551+ // If shift amounts are not the same we can't use BitTestIntrinsic.
31552+ if (BitChange.first != BitTested.first)
3142231553 return AtomicExpansionKind::CmpXChg;
3142331554
31555+ // If the atomic is an AND, it must mask out all but one bit and the
31556+ // following AND must test the one bit that is unset in the mask.
3142431557 if (AI->getOperation() == AtomicRMWInst::And)
31425- return ~C1->getValue() == C2->getValue( )
31558+ return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit )
3142631559 ? AtomicExpansionKind::BitTestIntrinsic
3142731560 : AtomicExpansionKind::CmpXChg;
3142831561
31429- return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31430- : AtomicExpansionKind::CmpXChg;
31562+ // If atomic XOR/OR need to be setting and testing the same bit.
31563+ return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31564+ ? AtomicExpansionKind::BitTestIntrinsic
31565+ : AtomicExpansionKind::CmpXChg;
3143131566}
3143231567
3143331568void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
3143431569 IRBuilder<> Builder(AI);
31435- Intrinsic::ID IID = Intrinsic::not_intrinsic;
31570+ Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
31571+ Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
3143631572 switch (AI->getOperation()) {
3143731573 default:
3143831574 llvm_unreachable("Unknown atomic operation");
3143931575 case AtomicRMWInst::Or:
31440- IID = Intrinsic::x86_atomic_bts;
31576+ IID_C = Intrinsic::x86_atomic_bts;
31577+ IID_I = Intrinsic::x86_atomic_bts_rm;
3144131578 break;
3144231579 case AtomicRMWInst::Xor:
31443- IID = Intrinsic::x86_atomic_btc;
31580+ IID_C = Intrinsic::x86_atomic_btc;
31581+ IID_I = Intrinsic::x86_atomic_btc_rm;
3144431582 break;
3144531583 case AtomicRMWInst::And:
31446- IID = Intrinsic::x86_atomic_btr;
31584+ IID_C = Intrinsic::x86_atomic_btr;
31585+ IID_I = Intrinsic::x86_atomic_btr_rm;
3144731586 break;
3144831587 }
3144931588 Instruction *I = AI->user_back();
3145031589 LLVMContext &Ctx = AI->getContext();
31451- unsigned Imm =
31452- countTrailingZeros(cast<ConstantInt>(I->getOperand(1))->getZExtValue());
31453- Function *BitTest =
31454- Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
3145531590 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
3145631591 Type::getInt8PtrTy(Ctx));
31457- Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
31592+ Function *BitTest = nullptr;
31593+ Value *Result = nullptr;
31594+ auto BitTested = FindSingleBitChange(AI->getValOperand());
31595+ assert(BitTested.first != nullptr);
31596+ if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31597+ auto *C = dyn_cast<ConstantInt>(I->getOperand(1));
31598+ assert(C != nullptr);
31599+
31600+ BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
31601+
31602+ unsigned Imm = countTrailingZeros(C->getZExtValue());
31603+ Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
31604+ } else {
31605+ BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
31606+
31607+ assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31608+
31609+ Value *SI = BitTested.first;
31610+ assert(SI != nullptr);
31611+
31612+ // BT{S|R|C} on memory operand don't modulo bit position so we need to
31613+ // mask it.
31614+ unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31615+ Value *BitPos =
31616+ Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31617+ // Todo(1): In many cases it may be provable that SI is less than
31618+ // ShiftBits in which case this mask is unnecessary
31619+ // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31620+ // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31621+ // favor of just a raw BT{S|R|C}.
31622+
31623+ Result = Builder.CreateCall(BitTest, {Addr, BitPos});
31624+ Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31625+
31626+ // If the result is only used for zero/non-zero status then we don't need to
31627+ // shift value back. Otherwise do so.
31628+ for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31629+ if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31630+ if (ICmp->isEquality()) {
31631+ auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31632+ auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31633+ if (C0 || C1) {
31634+ assert(C0 == nullptr || C1 == nullptr);
31635+ if ((C0 ? C0 : C1)->isZero())
31636+ continue;
31637+ }
31638+ }
31639+ }
31640+ Result = Builder.CreateShl(Result, BitPos);
31641+ break;
31642+ }
31643+ }
31644+
3145831645 I->replaceAllUsesWith(Result);
3145931646 I->eraseFromParent();
3146031647 AI->eraseFromParent();
@@ -34242,6 +34429,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
3424234429 NODE_NAME_CASE(LBTS)
3424334430 NODE_NAME_CASE(LBTC)
3424434431 NODE_NAME_CASE(LBTR)
34432+ NODE_NAME_CASE(LBTS_RM)
34433+ NODE_NAME_CASE(LBTC_RM)
34434+ NODE_NAME_CASE(LBTR_RM)
3424534435 NODE_NAME_CASE(AADD)
3424634436 NODE_NAME_CASE(AOR)
3424734437 NODE_NAME_CASE(AXOR)
0 commit comments