Skip to content

Commit 0b74e34

Browse files
committed
Transform AtomicRMW logic operations to BT{R|C|S} if only changing/testing a single bit.
This essentially expands on the optimizations added in D120199, but applies the optimization to cases where the bit being changed / tested is not an IMM but is a provable power of 2. The only case currently added is for patterns like: `__atomic_fetch_xor(p, 1 << c, __ATOMIC_RELAXED) & (1 << c)`, which instead of using a `cmpxchg` loop can be done with `btcl; setcc; shl`. There are still a variety of missed cases that could/should be addressed in the future. This commit documents many of those cases with Todos. Reviewed By: pengfei Differential Revision: https://reviews.llvm.org/D140939
1 parent 282d5a5 commit 0b74e34

File tree

6 files changed

+694
-1173
lines changed

6 files changed

+694
-1173
lines changed

llvm/include/llvm/IR/IntrinsicsX86.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,14 @@ let TargetPrefix = "x86" in {
7070
[ImmArg<ArgIndex<1>>]>;
7171
def int_x86_atomic_btr : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i8_ty],
7272
[ImmArg<ArgIndex<1>>]>;
73+
// Bit-test-and-{set,complement,reset} intrinsics whose second operand is an
// overloaded integer (llvm_anyint_ty) with no ImmArg attribute, so unlike the
// int_x86_atomic_bt{s,c,r} definitions above it need not be an immediate.
// Each takes a pointer plus that integer operand and returns an i8.
def int_x86_atomic_bts_rm : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty],
74+
[]>;
75+
def int_x86_atomic_btc_rm : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty],
76+
[]>;
77+
def int_x86_atomic_btr_rm : Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_anyint_ty],
78+
[]>;
79+
80+
7381
}
7482

7583
// Lock binary arith with CC.

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 207 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5654,6 +5654,18 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
56545654
MachineMemOperand::MOVolatile;
56555655
return true;
56565656
}
5657+
case Intrinsic::x86_atomic_bts_rm:
5658+
case Intrinsic::x86_atomic_btc_rm:
5659+
case Intrinsic::x86_atomic_btr_rm: {
5660+
Info.opc = ISD::INTRINSIC_W_CHAIN;
5661+
Info.ptrVal = I.getArgOperand(0);
5662+
unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5663+
Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5664+
Info.align = Align(Size);
5665+
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5666+
MachineMemOperand::MOVolatile;
5667+
return true;
5668+
}
56575669
case Intrinsic::x86_aadd32:
56585670
case Intrinsic::x86_aadd64:
56595671
case Intrinsic::x86_aand32:
@@ -28364,6 +28376,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
2836428376
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
2836528377
Operation.getValue(1));
2836628378
}
28379+
case Intrinsic::x86_atomic_bts_rm:
28380+
case Intrinsic::x86_atomic_btc_rm:
28381+
case Intrinsic::x86_atomic_btr_rm: {
28382+
SDLoc DL(Op);
28383+
MVT VT = Op.getSimpleValueType();
28384+
SDValue Chain = Op.getOperand(0);
28385+
SDValue Op1 = Op.getOperand(2);
28386+
SDValue Op2 = Op.getOperand(3);
28387+
unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28388+
: IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28389+
: X86ISD::LBTR_RM;
28390+
MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28391+
SDValue Res =
28392+
DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28393+
{Chain, Op1, Op2}, VT, MMO);
28394+
Chain = Res.getValue(1);
28395+
Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28396+
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28397+
}
2836728398
case Intrinsic::x86_atomic_bts:
2836828399
case Intrinsic::x86_atomic_btc:
2836928400
case Intrinsic::x86_atomic_btr: {
@@ -31401,6 +31432,75 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
3140131432
: AtomicExpansionKind::None;
3140231433
}
3140331434

31435+
// Classification produced by FindSingleBitChange for a value that is used as
// the operand of an atomic logic operation or of the AND that tests its result.
enum BitTestKind : unsigned {
31436+
UndefBit, // No recognized single-bit pattern; callers fall back to cmpxchg.
31437+
ConstantBit, // ConstantInt that is a power of 2.
31438+
NotConstantBit, // ConstantInt whose bitwise complement is a power of 2.
31439+
ShiftBit, // `1 << X` for a non-constant shift amount X.
31440+
NotShiftBit // Bitwise NOT of `1 << X`.
31441+
};
31442+
31443+
/// Classify \p V as a value that sets (or clears) exactly one bit.
///
/// \returns a pair {Bit, Kind}:
///  - If \p V is a ConstantInt: {V, ConstantBit} when it is a power of 2,
///    {V, NotConstantBit} when its complement is a power of 2, and
///    {V, UndefBit} otherwise.
///  - If \p V is `1 << X` or the bitwise NOT of `1 << X`: {X, ShiftBit} or
///    {X, NotShiftBit} respectively, where a shift mask of the form
///    `X & (BitWidth - 1)` is looked through so that X is the raw count.
///  - Otherwise {nullptr, UndefBit}.
static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
  using namespace llvm::PatternMatch;

  if (auto *C = dyn_cast<ConstantInt>(V)) {
    BitTestKind BTK = UndefBit;
    // Check if V is a power of 2 or NOT power of 2.
    if (isPowerOf2_64(C->getZExtValue()))
      BTK = ConstantBit;
    else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
      BTK = NotConstantBit;
    return {V, BTK};
  }

  // Check if V is some power of 2 pattern known to be non-zero
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return {nullptr, UndefBit};

  // Check if we have a NOT (either `X ^ -1` or `-1 - X`).
  bool Not = false;
  Value *PeekI;
  if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
      match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
    Not = true;
    I = dyn_cast<Instruction>(PeekI);
    // The negated value is not necessarily an instruction (it may be e.g. a
    // function argument). There is nothing to analyze in that case, so give
    // up rather than dereferencing a null pointer below.
    if (!I)
      return {nullptr, UndefBit};
  }

  // We can only use 1 << X without more sophisticated analysis. C << X where
  // C is a power of 2 but not 1 can result in zero which cannot be translated
  // to bittest. Likewise any C >> X (either arith or logical) can be zero.
  if (I->getOpcode() != Instruction::Shl)
    return {nullptr, UndefBit};

  // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
  // -X` and some other provable power of 2 patterns that we can use CTZ on
  // may be profitable.
  // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
  // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
  // be provably a non-zero power of 2.
  // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
  // transformable to bittest.
  auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
  if (!ShiftVal || !ShiftVal->equalsInt(1))
    return {nullptr, UndefBit};

  const BitTestKind BTK = Not ? NotShiftBit : ShiftBit;
  Value *BitV = I->getOperand(1);

  Value *AndOp;
  const APInt *AndC;
  if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
    // Read past a shiftmask instruction to find count
    if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
      BitV = AndOp;
  }
  return {BitV, BTK};
}
31503+
3140431504
TargetLowering::AtomicExpansionKind
3140531505
X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
3140631506
// If the atomicrmw's result isn't actually used, we can just add a "lock"
@@ -31410,51 +31510,138 @@ X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
3141031510

3141131511
// If the atomicrmw's result is used by a single bit AND, we may use
3141231512
// bts/btr/btc instruction for these operations.
31413-
auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
31513+
// Note: InstCombinePass can cause a de-optimization here. It replaces the
31514+
// SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31515+
// (depending on CC). This pattern can only use bts/btr/btc but we don't
31516+
// detect it.
3141431517
Instruction *I = AI->user_back();
31415-
if (!C1 || !AI->hasOneUse() || I->getOpcode() != Instruction::And ||
31518+
auto BitChange = FindSingleBitChange(AI->getValOperand());
31519+
if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31520+
I->getOpcode() != Instruction::And ||
31521+
AI->getType()->getPrimitiveSizeInBits() == 8 ||
3141631522
AI->getParent() != I->getParent())
3141731523
return AtomicExpansionKind::CmpXChg;
31524+
31525+
assert(I->getOperand(0) == AI);
3141831526
// The following instruction must be a AND single bit.
31419-
auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
31420-
unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
31421-
if (!C2 || Bits == 8 || !isPowerOf2_64(C2->getZExtValue()))
31527+
if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31528+
auto *C1 = dyn_cast<ConstantInt>(AI->getValOperand());
31529+
assert(C1 != nullptr);
31530+
auto *C2 = dyn_cast<ConstantInt>(I->getOperand(1));
31531+
if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31532+
return AtomicExpansionKind::CmpXChg;
31533+
}
31534+
if (AI->getOperation() == AtomicRMWInst::And) {
31535+
return ~C1->getValue() == C2->getValue()
31536+
? AtomicExpansionKind::BitTestIntrinsic
31537+
: AtomicExpansionKind::CmpXChg;
31538+
}
31539+
return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31540+
: AtomicExpansionKind::CmpXChg;
31541+
}
31542+
31543+
assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31544+
31545+
auto BitTested = FindSingleBitChange(I->getOperand(1));
31546+
if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31547+
return AtomicExpansionKind::CmpXChg;
31548+
31549+
assert(BitChange.first != nullptr && BitTested.first != nullptr);
31550+
31551+
// If shift amounts are not the same we can't use BitTestIntrinsic.
31552+
if (BitChange.first != BitTested.first)
3142231553
return AtomicExpansionKind::CmpXChg;
3142331554

31555+
// If atomic AND, it needs to be masking all but one bit and testing the one bit
31556+
// unset in the mask.
3142431557
if (AI->getOperation() == AtomicRMWInst::And)
31425-
return ~C1->getValue() == C2->getValue()
31558+
return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
3142631559
? AtomicExpansionKind::BitTestIntrinsic
3142731560
: AtomicExpansionKind::CmpXChg;
3142831561

31429-
return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31430-
: AtomicExpansionKind::CmpXChg;
31562+
// If atomic XOR/OR need to be setting and testing the same bit.
31563+
return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31564+
? AtomicExpansionKind::BitTestIntrinsic
31565+
: AtomicExpansionKind::CmpXChg;
3143131566
}
3143231567

3143331568
void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
3143431569
IRBuilder<> Builder(AI);
31435-
Intrinsic::ID IID = Intrinsic::not_intrinsic;
31570+
Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
31571+
Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
3143631572
switch (AI->getOperation()) {
3143731573
default:
3143831574
llvm_unreachable("Unknown atomic operation");
3143931575
case AtomicRMWInst::Or:
31440-
IID = Intrinsic::x86_atomic_bts;
31576+
IID_C = Intrinsic::x86_atomic_bts;
31577+
IID_I = Intrinsic::x86_atomic_bts_rm;
3144131578
break;
3144231579
case AtomicRMWInst::Xor:
31443-
IID = Intrinsic::x86_atomic_btc;
31580+
IID_C = Intrinsic::x86_atomic_btc;
31581+
IID_I = Intrinsic::x86_atomic_btc_rm;
3144431582
break;
3144531583
case AtomicRMWInst::And:
31446-
IID = Intrinsic::x86_atomic_btr;
31584+
IID_C = Intrinsic::x86_atomic_btr;
31585+
IID_I = Intrinsic::x86_atomic_btr_rm;
3144731586
break;
3144831587
}
3144931588
Instruction *I = AI->user_back();
3145031589
LLVMContext &Ctx = AI->getContext();
31451-
unsigned Imm =
31452-
countTrailingZeros(cast<ConstantInt>(I->getOperand(1))->getZExtValue());
31453-
Function *BitTest =
31454-
Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
3145531590
Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
3145631591
Type::getInt8PtrTy(Ctx));
31457-
Value *Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
31592+
Function *BitTest = nullptr;
31593+
Value *Result = nullptr;
31594+
auto BitTested = FindSingleBitChange(AI->getValOperand());
31595+
assert(BitTested.first != nullptr);
31596+
if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31597+
auto *C = dyn_cast<ConstantInt>(I->getOperand(1));
31598+
assert(C != nullptr);
31599+
31600+
BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
31601+
31602+
unsigned Imm = countTrailingZeros(C->getZExtValue());
31603+
Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
31604+
} else {
31605+
BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
31606+
31607+
assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31608+
31609+
Value *SI = BitTested.first;
31610+
assert(SI != nullptr);
31611+
31612+
// BT{S|R|C} on memory operand don't modulo bit position so we need to
31613+
// mask it.
31614+
unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31615+
Value *BitPos =
31616+
Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31617+
// Todo(1): In many cases it may be provable that SI is less than
31618+
// ShiftBits in which case this mask is unnecessary
31619+
// Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31620+
// << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31621+
// favor of just a raw BT{S|R|C}.
31622+
31623+
Result = Builder.CreateCall(BitTest, {Addr, BitPos});
31624+
Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31625+
31626+
// If the result is only used for zero/non-zero status then we don't need to
31627+
// shift value back. Otherwise do so.
31628+
for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31629+
if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31630+
if (ICmp->isEquality()) {
31631+
auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31632+
auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31633+
if (C0 || C1) {
31634+
assert(C0 == nullptr || C1 == nullptr);
31635+
if ((C0 ? C0 : C1)->isZero())
31636+
continue;
31637+
}
31638+
}
31639+
}
31640+
Result = Builder.CreateShl(Result, BitPos);
31641+
break;
31642+
}
31643+
}
31644+
3145831645
I->replaceAllUsesWith(Result);
3145931646
I->eraseFromParent();
3146031647
AI->eraseFromParent();
@@ -34242,6 +34429,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
3424234429
NODE_NAME_CASE(LBTS)
3424334430
NODE_NAME_CASE(LBTC)
3424434431
NODE_NAME_CASE(LBTR)
34432+
NODE_NAME_CASE(LBTS_RM)
34433+
NODE_NAME_CASE(LBTC_RM)
34434+
NODE_NAME_CASE(LBTR_RM)
3424534435
NODE_NAME_CASE(AADD)
3424634436
NODE_NAME_CASE(AOR)
3424734437
NODE_NAME_CASE(AXOR)

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -798,6 +798,9 @@ namespace llvm {
798798
LBTS,
799799
LBTC,
800800
LBTR,
801+
LBTS_RM,
802+
LBTC_RM,
803+
LBTR_RM,
801804

802805
/// RAO arithmetic instructions.
803806
/// OUTCHAIN = AADD(INCHAIN, PTR, RHS)

llvm/lib/Target/X86/X86InstrCompiler.td

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,17 @@ def x86btc : SDNode<"X86ISD::LBTC", X86LBTest,
864864
def x86btr : SDNode<"X86ISD::LBTR", X86LBTest,
865865
[SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
866866

867+
// Type profile shared by the LBT{S,C,R}_RM nodes below: one i32 result, a
// pointer-typed first operand and an integer second operand.
def X86LBTestRM : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>,
868+
SDTCisInt<2>]>;
869+
870+
// Selection DAG nodes for X86ISD::LBT{S,C,R}_RM. Each is chained, may load and
// store, and carries a memory operand.
def x86_rm_bts : SDNode<"X86ISD::LBTS_RM", X86LBTestRM,
871+
[SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
872+
def x86_rm_btc : SDNode<"X86ISD::LBTC_RM", X86LBTestRM,
873+
[SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
874+
def x86_rm_btr : SDNode<"X86ISD::LBTR_RM", X86LBTestRM,
875+
[SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
876+
877+
867878
multiclass ATOMIC_LOGIC_OP<Format Form, string s> {
868879
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
869880
SchedRW = [WriteBitTestSetRegRMW] in {
@@ -882,10 +893,33 @@ multiclass ATOMIC_LOGIC_OP<Format Form, string s> {
882893
}
883894
}
884895

896+
// Defines the 16-, 32- and 64-bit LOCK-prefixed forms of the bit-test
// instruction named by `s`, taking a memory operand ($src1) and a register
// operand ($src2). Each def sets EFLAGS from the matching "x86_rm_" # s node.
// NOTE(review): these defs use the Ii8/RIi8 format classes although the
// operand list contains no immediate — confirm whether I/RI was intended.
multiclass ATOMIC_LOGIC_OP_RM<bits<8> Opc8, string s> {
897+
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
898+
SchedRW = [WriteBitTestSetRegRMW] in {
899+
def 16rm : Ii8<Opc8, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
900+
!strconcat(s, "{w}\t{$src2, $src1|$src1, $src2}"),
901+
[(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR16:$src2))]>,
902+
OpSize16, TB, LOCK;
903+
def 32rm : Ii8<Opc8, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
904+
!strconcat(s, "{l}\t{$src2, $src1|$src1, $src2}"),
905+
[(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR32:$src2))]>,
906+
OpSize32, TB, LOCK;
907+
def 64rm : RIi8<Opc8, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
908+
!strconcat(s, "{q}\t{$src2, $src1|$src1, $src2}"),
909+
[(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR64:$src2))]>,
910+
TB, LOCK;
911+
}
912+
}
913+
914+
885915
defm LOCK_BTS : ATOMIC_LOGIC_OP<MRM5m, "bts">;
886916
defm LOCK_BTC : ATOMIC_LOGIC_OP<MRM7m, "btc">;
887917
defm LOCK_BTR : ATOMIC_LOGIC_OP<MRM6m, "btr">;
888918

919+
defm LOCK_BTS_RM : ATOMIC_LOGIC_OP_RM<0xAB, "bts">;
920+
defm LOCK_BTC_RM : ATOMIC_LOGIC_OP_RM<0xBB, "btc">;
921+
defm LOCK_BTR_RM : ATOMIC_LOGIC_OP_RM<0xB3, "btr">;
922+
889923
// Atomic compare and swap.
890924
multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
891925
string mnemonic, SDPatternOperator frag> {

0 commit comments

Comments
 (0)