@@ -16575,9 +16575,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
1657516575
1657616576TargetLowering::AtomicExpansionKind
1657716577SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16578- return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16579- ? AtomicExpansionKind::NotAtomic
16580- : AtomicExpansionKind::None;
16578+ unsigned AddrSpace = CmpX->getPointerAddressSpace();
16579+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16580+ return AtomicExpansionKind::NotAtomic;
16581+
16582+ if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16583+ return AtomicExpansionKind::None;
16584+
16585+ const DataLayout &DL = CmpX->getDataLayout();
16586+
16587+ Type *ValTy = CmpX->getNewValOperand()->getType();
16588+
16589+ // If a 64-bit flat cmpxchg may alias private memory, request expansion so the
16590+ // private case avoids the atomic instruction; other sizes are left unexpanded.
16591+ return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16592+ : AtomicExpansionKind::None;
1658116593}
1658216594
1658316595const TargetRegisterClass *
@@ -16743,40 +16755,8 @@ bool SITargetLowering::checkForPhysRegDependency(
1674316755 return false;
1674416756}
1674516757
16746- void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16747- AtomicRMWInst::BinOp Op = AI->getOperation();
16748-
16749- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16750- Op == AtomicRMWInst::Xor) {
16751- if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16752- ConstVal && ConstVal->isNullValue()) {
16753- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16754- AI->setOperation(AtomicRMWInst::Add);
16755-
16756- // TODO: Turn the below private handling into a no-op for idempotent
16757- // cases.
16758- }
16759- }
16760-
16761- // The non-flat expansions should only perform the de-canonicalization of
16762- // identity values.
16763- if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16764- return;
16765-
16766- // FullFlatEmulation is true if we need to issue the private, shared, and
16767- // global cases.
16768- //
16769- // If this is false, we are only dealing with the flat-targeting-private case,
16770- // where we only insert a check for private and still use the flat instruction
16771- // for global and shared.
16772-
16773- // TODO: Avoid the private check for the fadd case depending on
16774- // noalias.addrspace.
16775-
16776- bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16777- Subtarget->hasAtomicFaddInsts() &&
16778- AI->getType()->isFloatTy();
16779-
16758+ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
16759+ Instruction *AI) const {
1678016760 // Given: atomicrmw fadd ptr %addr, float %val ordering
1678116761 //
1678216762 // With this expansion we produce the following code:
@@ -16823,6 +16803,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1682316803 IRBuilder<> Builder(AI);
1682416804 LLVMContext &Ctx = Builder.getContext();
1682516805
16806+ auto *RMW = dyn_cast<AtomicRMWInst>(AI);
16807+ const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
16808+ : AtomicCmpXchgInst::getPointerOperandIndex();
16809+ Value *Addr = AI->getOperand(PtrOpIdx);
16810+
16811+ // TODO: Only need to check private, then emit flat-known-not-private (no
16812+ // need for the shared block or the cast to global).
16813+ AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
16814+
16815+ Align Alignment;
16816+ if (RMW)
16817+ Alignment = RMW->getAlign();
16818+ else if (CX)
16819+ Alignment = CX->getAlign();
16820+ else
16821+ llvm_unreachable("unhandled atomic operation");
16822+
16823+ // FullFlatEmulation is true if we need to issue the private, shared, and
16824+ // global cases.
16825+ //
16826+ // If this is false, we are only dealing with the flat-targeting-private case,
16827+ // where we only insert a check for private and still use the flat instruction
16828+ // for global and shared.
16829+
16830+ bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
16831+ Subtarget->hasAtomicFaddInsts() &&
16832+ RMW->getType()->isFloatTy();
16833+
1682616834 // If the return value isn't used, do not introduce a false use in the phi.
1682716835 bool ReturnValueIsUsed = !AI->use_empty();
1682816836
@@ -16844,11 +16852,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1684416852 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
1684516853 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
1684616854
16847- Value *Val = AI->getValOperand();
16848- Type *ValTy = Val->getType();
16849- Value *Addr = AI->getPointerOperand();
16850- Align Alignment = AI->getAlign();
16851-
1685216855 std::prev(BB->end())->eraseFromParent();
1685316856 Builder.SetInsertPoint(BB);
1685416857
@@ -16863,8 +16866,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1686316866
1686416867 Instruction *Clone = AI->clone();
1686516868 Clone->insertInto(SharedBB, SharedBB->end());
16866- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16867- .set(CastToLocal);
16869+ Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
1686816870 LoadedShared = Clone;
1686916871
1687016872 Builder.CreateBr(PhiBB);
@@ -16876,14 +16878,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1687616878 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
1687716879
1687816880 Builder.SetInsertPoint(PrivateBB);
16881+
1687916882 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
1688016883 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
16881- Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
16882- Alignment, "loaded.private");
1688316884
16884- Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
16885+ Value *LoadedPrivate;
16886+ if (RMW) {
16887+ LoadedPrivate = Builder.CreateAlignedLoad(
16888+ RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
16889+
16890+ Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
16891+ LoadedPrivate, RMW->getValOperand());
16892+
16893+ Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
16894+ } else {
16895+ auto [ResultLoad, Equal] =
16896+ buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
16897+ CX->getNewValOperand(), CX->getAlign());
16898+
16899+ Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
16900+ ResultLoad, 0);
16901+ LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
16902+ }
1688516903
16886- Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
1688716904 Builder.CreateBr(PhiBB);
1688816905
1688916906 Builder.SetInsertPoint(GlobalBB);
@@ -16893,8 +16910,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1689316910 if (FullFlatEmulation) {
1689416911 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
1689516912 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16896- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16897- .set(CastToGlobal);
16913+ AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
1689816914 }
1689916915
1690016916 AI->removeFromParent();
@@ -16918,7 +16934,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1691816934 Builder.SetInsertPoint(PhiBB);
1691916935
1692016936 if (ReturnValueIsUsed) {
16921- PHINode *Loaded = Builder.CreatePHI(ValTy , 3);
16937+ PHINode *Loaded = Builder.CreatePHI(AI->getType() , 3);
1692216938 AI->replaceAllUsesWith(Loaded);
1692316939 if (FullFlatEmulation)
1692416940 Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16930,6 +16946,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1693016946 Builder.CreateBr(ExitBB);
1693116947}
1693216948
16949+ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16950+ AtomicRMWInst::BinOp Op = AI->getOperation();
16951+
16952+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16953+ Op == AtomicRMWInst::Xor) {
16954+ if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16955+ ConstVal && ConstVal->isNullValue()) {
16956+ // atomicrmw {sub,or,xor} %ptr, 0 -> atomicrmw add %ptr, 0
16957+ AI->setOperation(AtomicRMWInst::Add);
16958+
16959+ // The flat-may-alias-private handling below may still be required.
16960+
16961+ // TODO: Skip this for cases where we cannot access remote memory.
16962+ }
16963+ }
16964+
16965+ // For non-flat address spaces the only work is the identity-value rewrite
16966+ // above, so stop here.
16967+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16968+ return;
16969+
16970+ emitExpandAtomicAddrSpacePredicate(AI);
16971+ }
16972+
16973+ void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
16974+ emitExpandAtomicAddrSpacePredicate(CI); // Same helper emitExpandAtomicRMW uses.
16975+ }
16976+
1693316977LoadInst *
1693416978SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
1693516979 IRBuilder<> Builder(AI);
0 commit comments