@@ -16588,9 +16588,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
-  return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
-             : AtomicExpansionKind::None;
+  unsigned AddrSpace = CmpX->getPointerAddressSpace();
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+    return AtomicExpansionKind::NotAtomic;
+
+  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+    return AtomicExpansionKind::None;
+
+  const DataLayout &DL = CmpX->getDataLayout();
+
+  Type *ValTy = CmpX->getNewValOperand()->getType();
+
+  // If a 64-bit flat atomic may alias private, we need to avoid using the
+  // atomic in the private case.
+  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+                                           : AtomicExpansionKind::None;
 }
 
 const TargetRegisterClass *
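
With this change the cmpxchg expansion decision depends on both the address space and the operation width. A rough illustration in IR of the three outcomes (the pointer names, widths, and orderings are invented for the example):

    ; private pointer: NotAtomic -- lowered to an ordinary scratch load/store
    %r0 = cmpxchg ptr addrspace(5) %p, i32 %cmp, i32 %new monotonic monotonic

    ; 64-bit flat pointer that may alias private: Expand, handed back to
    ; emitExpandAtomicCmpXchg below
    %r1 = cmpxchg ptr %q, i64 %cmp64, i64 %new64 seq_cst seq_cst

    ; 32-bit flat, or flat proven not to access private: None, kept as-is
    %r2 = cmpxchg ptr %q, i32 %cmp, i32 %new seq_cst seq_cst
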
@@ -16754,40 +16766,8 @@ bool SITargetLowering::checkForPhysRegDependency(
   return false;
 }
 
-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
-  AtomicRMWInst::BinOp Op = AI->getOperation();
-
-  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-      Op == AtomicRMWInst::Xor) {
-    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
-        ConstVal && ConstVal->isNullValue()) {
-      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-      AI->setOperation(AtomicRMWInst::Add);
-
-      // TODO: Turn the below private handling into a no-op for idempotent
-      // cases.
-    }
-  }
-
-  // The non-flat expansions should only perform the de-canonicalization of
-  // identity values.
-  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return;
-
-  // FullFlatEmulation is true if we need to issue the private, shared, and
-  // global cases.
-  //
-  // If this is false, we are only dealing with the flat-targeting-private case,
-  // where we only insert a check for private and still use the flat instruction
-  // for global and shared.
-
-  // TODO: Avoid the private check for the fadd case depending on
-  // noalias.addrspace.
-
-  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
-                           Subtarget->hasAtomicFaddInsts() &&
-                           AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+    Instruction *AI) const {
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
   // With this expansion we produce the following code:
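
The rest of this comment (elided from the hunk) walks through the produced control flow. For the full-flat-emulation case it is approximately the following, a sketch assuming the amdgcn.is.shared/amdgcn.is.private intrinsics and the block names created below:

    atomicrmw.check.shared:
      %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
      br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private

    atomicrmw.shared:
      %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
      %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared, float %val ordering
      br label %atomicrmw.phi

    atomicrmw.check.private:
      %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
      br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

    atomicrmw.private:
      %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
      %loaded.private = load float, ptr addrspace(5) %cast.private
      %val.new = fadd float %loaded.private, %val
      store float %val.new, ptr addrspace(5) %cast.private
      br label %atomicrmw.phi

    atomicrmw.global:
      %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
      %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global, float %val ordering
      br label %atomicrmw.phi

    atomicrmw.phi:
      %loaded = phi float [ %loaded.shared, %atomicrmw.shared ],
                          [ %loaded.private, %atomicrmw.private ],
                          [ %loaded.global, %atomicrmw.global ]
      br label %atomicrmw.end
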
@@ -16834,6 +16814,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
 
+  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+                                : AtomicCmpXchgInst::getPointerOperandIndex();
+  Value *Addr = AI->getOperand(PtrOpIdx);
+
+  /// TODO: Only need to check private, then emit flat-known-not private (no
+  /// need for shared block, or cast to global).
+  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+  Align Alignment;
+  if (RMW)
+    Alignment = RMW->getAlign();
+  else if (CX)
+    Alignment = CX->getAlign();
+  else
+    llvm_unreachable("unhandled atomic operation");
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           RMW->getType()->isFloatTy();
+
   // If the return value isn't used, do not introduce a false use in the phi.
   bool ReturnValueIsUsed = !AI->use_empty();
 
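
Only a flat fadd on float, on subtargets with the global fadd instructions, takes the full three-way split; a cmpxchg (where RMW is null) never does and only gets the private check. Illustrative IR for the two cases (names and orderings invented):

    ; FullFlatEmulation: shared, private and global each get a dedicated path
    %v = atomicrmw fadd ptr %addr, float %val monotonic

    ; not full emulation: only the is.private check is inserted, and the
    ; original flat instruction still covers both global and shared
    %r = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
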
@@ -16855,11 +16863,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
 
-  Value *Val = AI->getValOperand();
-  Type *ValTy = Val->getType();
-  Value *Addr = AI->getPointerOperand();
-  Align Alignment = AI->getAlign();
-
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
 
@@ -16874,8 +16877,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   Instruction *Clone = AI->clone();
   Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
+  Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
   LoadedShared = Clone;
 
   Builder.CreateBr(PhiBB);
@@ -16887,14 +16889,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
 
   Builder.SetInsertPoint(PrivateBB);
+
   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
-  Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
-                                                   Alignment, "loaded.private");
 
-  Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+  Value *LoadedPrivate;
+  if (RMW) {
+    LoadedPrivate = Builder.CreateAlignedLoad(
+        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+                                        LoadedPrivate, RMW->getValOperand());
+
+    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+  } else {
+    auto [ResultLoad, Equal] =
+        buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+                                CX->getNewValOperand(), CX->getAlign());
+
+    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+                                              ResultLoad, 0);
+    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+  }
 
-  Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
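
On the private path, a cmpxchg has to reproduce the { value, success } aggregate the original instruction returns, which is what the two CreateInsertValue calls build. Assuming buildAtomicCmpXchgValue emits the usual non-atomic load/compare/select/store sequence, the generated IR looks roughly like:

    atomicrmw.private:
      %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
      %loaded = load i64, ptr addrspace(5) %cast.private, align 8
      %eq = icmp eq i64 %loaded, %cmp
      %store.val = select i1 %eq, i64 %new, i64 %loaded
      store i64 %store.val, ptr addrspace(5) %cast.private, align 8
      %agg.0 = insertvalue { i64, i1 } poison, i64 %loaded, 0
      %agg = insertvalue { i64, i1 } %agg.0, i1 %eq, 1
      br label %atomicrmw.phi

Dropping atomicity here is sound because private (scratch) memory is only reachable from the owning lane, so no other thread can race on it.
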
@@ -16904,8 +16921,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (FullFlatEmulation) {
     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToGlobal);
+    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
   }
 
   AI->removeFromParent();
@@ -16929,7 +16945,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
-    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
     AI->replaceAllUsesWith(Loaded);
     if (FullFlatEmulation)
       Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16941,6 +16957,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(ExitBB);
 }
 
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+
+  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+      Op == AtomicRMWInst::Xor) {
+    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // We may still need the private-alias-flat handling below.
+
+      // TODO: Skip this for cases where we cannot access remote memory.
+    }
+  }
+
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  emitExpandAtomicAddrSpacePredicate(CI);
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
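
Taken together, the new emitExpandAtomicCmpXchg hook turns a 64-bit flat cmpxchg that may alias private into roughly the following shape (a sketch; in this non-full-emulation case only the private check is inserted, and the original flat cmpxchg is sunk into the global block to serve both global and shared):

    ; before
    %r = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst

    ; after
      %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
      br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
    atomicrmw.private:
      ; non-atomic load/compare/select/store on the addrspace(5) cast,
      ; repacked with insertvalue as sketched above
      br label %atomicrmw.phi
    atomicrmw.global:
      %r.flat = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
      br label %atomicrmw.phi
    atomicrmw.phi:
      %r = phi { i64, i1 } [ %agg, %atomicrmw.private ], [ %r.flat, %atomicrmw.global ]
      br label %atomicrmw.end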