@@ -16504,9 +16504,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
-  return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
-             : AtomicExpansionKind::None;
+  unsigned AddrSpace = CmpX->getPointerAddressSpace();
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+    return AtomicExpansionKind::NotAtomic;
+
+  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+    return AtomicExpansionKind::None;
+
+  const DataLayout &DL = CmpX->getDataLayout();
+
+  Type *ValTy = CmpX->getNewValOperand()->getType();
+
+  // If a 64-bit flat atomic may alias private, we need to avoid using the
+  // atomic in the private case.
+  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+                                           : AtomicExpansionKind::None;
 }
 
 const TargetRegisterClass *
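
As a hedged sketch (the IR below is illustrative, not taken from this change),
the new predicate sorts cmpxchg instructions roughly as follows:

    ; private pointer: AtomicExpansionKind::NotAtomic (lowered non-atomically)
    %r0 = cmpxchg ptr addrspace(5) %priv, i32 %c, i32 %n monotonic monotonic

    ; 32-bit flat cmpxchg: still AtomicExpansionKind::None, emitted as-is
    %r1 = cmpxchg ptr %flat, i32 %c, i32 %n monotonic monotonic

    ; 64-bit flat cmpxchg that may access private memory:
    ; AtomicExpansionKind::Expand, handled by emitExpandAtomicCmpXchg below
    %r2 = cmpxchg ptr %flat, i64 %c64, i64 %n64 monotonic monotonic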
@@ -16670,40 +16682,8 @@ bool SITargetLowering::checkForPhysRegDependency(
   return false;
 }
 
-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
-  AtomicRMWInst::BinOp Op = AI->getOperation();
-
-  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-      Op == AtomicRMWInst::Xor) {
-    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
-        ConstVal && ConstVal->isNullValue()) {
-      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-      AI->setOperation(AtomicRMWInst::Add);
-
-      // TODO: Turn the below private handling into a no-op for idempotent
-      // cases.
-    }
-  }
-
-  // The non-flat expansions should only perform the de-canonicalization of
-  // identity values.
-  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return;
-
-  // FullFlatEmulation is true if we need to issue the private, shared, and
-  // global cases.
-  //
-  // If this is false, we are only dealing with the flat-targeting-private case,
-  // where we only insert a check for private and still use the flat instruction
-  // for global and shared.
-
-  // TODO: Avoid the private check for the fadd case depending on
-  // noalias.addrspace.
-
-  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
-                           Subtarget->hasAtomicFaddInsts() &&
-                           AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+    Instruction *AI) const {
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
   // With this expansion we produce the following code:
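
The expansion skeleton this comment introduces falls outside the context shown
here; a hedged sketch of the FullFlatEmulation shape (block names follow the
BasicBlock::Create calls below, and the use of the llvm.amdgcn.is.shared /
llvm.amdgcn.is.private predicates is assumed):

    %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
    br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
    ; atomicrmw.shared: cast %addr to addrspace(3), run the atomic there
    ; atomicrmw.check.private:
    %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
    br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
    ; atomicrmw.private: non-atomic load/op/store on the addrspace(5) cast
    ; atomicrmw.global: cast %addr to addrspace(1), reuse the original atomic
    ; atomicrmw.phi: phi of the three results, then branch to the exit block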
@@ -16750,6 +16730,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
 
+  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+                                : AtomicCmpXchgInst::getPointerOperandIndex();
+  Value *Addr = AI->getOperand(PtrOpIdx);
+
+  /// TODO: Only need to check private, then emit flat-known-not private (no
+  /// need for shared block, or cast to global).
+  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+  Align Alignment;
+  if (RMW)
+    Alignment = RMW->getAlign();
+  else if (CX)
+    Alignment = CX->getAlign();
+  else
+    llvm_unreachable("unhandled atomic operation");
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           RMW->getType()->isFloatTy();
+
   // If the return value isn't used, do not introduce a false use in the phi.
   bool ReturnValueIsUsed = !AI->use_empty();
 
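
Concretely (a hedged example, not part of the patch): only a float atomicrmw
fadd on a subtarget with global fadd instructions takes the full three-way
split; a cmpxchg (where RMW is null) or any other RMW operation gets just the
private check and keeps the flat instruction for global and shared:

    %f = atomicrmw fadd ptr %p, float 1.0 monotonic          ; may fully emulate
    %r = cmpxchg ptr %p, i64 %c, i64 %n monotonic monotonic  ; private check only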
@@ -16771,11 +16779,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
 
-  Value *Val = AI->getValOperand();
-  Type *ValTy = Val->getType();
-  Value *Addr = AI->getPointerOperand();
-  Align Alignment = AI->getAlign();
-
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
 
@@ -16790,8 +16793,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   Instruction *Clone = AI->clone();
   Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
+  Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
   LoadedShared = Clone;
 
   Builder.CreateBr(PhiBB);
@@ -16803,14 +16805,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
 
   Builder.SetInsertPoint(PrivateBB);
+
   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
-  Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
-                                                   Alignment, "loaded.private");
 
-  Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+  Value *LoadedPrivate;
+  if (RMW) {
+    LoadedPrivate = Builder.CreateAlignedLoad(
+        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+                                        LoadedPrivate, RMW->getValOperand());
+
+    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+  } else {
+    auto [ResultLoad, Equal] =
+        buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+                                CX->getNewValOperand(), CX->getAlign());
+
+    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+                                              ResultLoad, 0);
+    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+  }
 
-  Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
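
For the cmpxchg side, assuming buildAtomicCmpXchgValue emits the usual
non-atomic load/icmp/select/store sequence (safe here because private memory
is only visible to the issuing thread), the private branch lowers roughly to:

    %ld = load i64, ptr addrspace(5) %cast.private, align 8
    %eq = icmp eq i64 %ld, %cmp
    %st = select i1 %eq, i64 %new, i64 %ld
    store i64 %st, ptr addrspace(5) %cast.private, align 8
    %r0 = insertvalue { i64, i1 } poison, i64 %ld, 0
    %r  = insertvalue { i64, i1 } %r0, i1 %eq, 1

The two insertvalue instructions rebuild the { value, success } aggregate that
a real cmpxchg returns.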
@@ -16820,8 +16837,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (FullFlatEmulation) {
     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToGlobal);
+    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
   }
 
   AI->removeFromParent();
@@ -16845,7 +16861,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
-    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
     AI->replaceAllUsesWith(Loaded);
     if (FullFlatEmulation)
       Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16857,6 +16873,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(ExitBB);
 }
 
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+
+  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+      Op == AtomicRMWInst::Xor) {
+    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // We may still need the private-alias-flat handling below.
+
+      // TODO: Skip this for cases where we cannot access remote memory.
+    }
+  }
+
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  emitExpandAtomicAddrSpacePredicate(CI);
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
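
The identity rewrite at the top of emitExpandAtomicRMW amounts to (sketch):

    ; before:
    %old = atomicrmw or ptr %p, i32 0 seq_cst
    ; after setOperation(AtomicRMWInst::Add):
    %old = atomicrmw add ptr %p, i32 0 seq_cst

Both forms leave memory unchanged and return the old value, so only the opcode
changes; the flat-aliases-private expansion above still applies.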