@@ -16607,9 +16607,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
1660716607
1660816608TargetLowering::AtomicExpansionKind
1660916609SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16610- return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16611- ? AtomicExpansionKind::NotAtomic
16612- : AtomicExpansionKind::None;
16610+ unsigned AddrSpace = CmpX->getPointerAddressSpace();
16611+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16612+ return AtomicExpansionKind::NotAtomic;
16613+
16614+ if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16615+ return AtomicExpansionKind::None;
16616+
16617+ const DataLayout &DL = CmpX->getDataLayout();
16618+
16619+ Type *ValTy = CmpX->getNewValOperand()->getType();
16620+
16621+ // A 64-bit flat atomic that may alias private memory must not be performed
16622+ // as an atomic in the private case, so request expansion for it.
16623+ return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16624+ : AtomicExpansionKind::None;
1661316625}
1661416626
1661516627const TargetRegisterClass *
@@ -16773,40 +16785,8 @@ bool SITargetLowering::checkForPhysRegDependency(
1677316785 return false;
1677416786}
1677516787
16776- void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16777- AtomicRMWInst::BinOp Op = AI->getOperation();
16778-
16779- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16780- Op == AtomicRMWInst::Xor) {
16781- if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16782- ConstVal && ConstVal->isNullValue()) {
16783- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16784- AI->setOperation(AtomicRMWInst::Add);
16785-
16786- // TODO: Turn the below private handling into a no-op for idempotent
16787- // cases.
16788- }
16789- }
16790-
16791- // The non-flat expansions should only perform the de-canonicalization of
16792- // identity values.
16793- if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16794- return;
16795-
16796- // FullFlatEmulation is true if we need to issue the private, shared, and
16797- // global cases.
16798- //
16799- // If this is false, we are only dealing with the flat-targeting-private case,
16800- // where we only insert a check for private and still use the flat instruction
16801- // for global and shared.
16802-
16803- // TODO: Avoid the private check for the fadd case depending on
16804- // noalias.addrspace.
16805-
16806- bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16807- Subtarget->hasAtomicFaddInsts() &&
16808- AI->getType()->isFloatTy();
16809-
16788+ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
16789+ Instruction *AI) const {
1681016790 // Given: atomicrmw fadd ptr %addr, float %val ordering
1681116791 //
1681216792 // With this expansion we produce the following code:
@@ -16853,6 +16833,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1685316833 IRBuilder<> Builder(AI);
1685416834 LLVMContext &Ctx = Builder.getContext();
1685516835
16836+ auto *RMW = dyn_cast<AtomicRMWInst>(AI);
16837+ const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
16838+ : AtomicCmpXchgInst::getPointerOperandIndex();
16839+ Value *Addr = AI->getOperand(PtrOpIdx);
16840+
16841+ // TODO: Only the private check is needed here; the remaining cases can use
16842+ // a flat access known not to be private (no shared block or global cast).
16843+ AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
16844+
16845+ Align Alignment;
16846+ if (RMW)
16847+ Alignment = RMW->getAlign();
16848+ else if (CX)
16849+ Alignment = CX->getAlign();
16850+ else
16851+ llvm_unreachable("unhandled atomic operation");
16852+
16853+ // FullFlatEmulation is true if we need to issue the private, shared, and
16854+ // global cases.
16855+ //
16856+ // If this is false, we are only dealing with the flat-targeting-private case,
16857+ // where we only insert a check for private and still use the flat instruction
16858+ // for global and shared.
16859+
16860+ bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
16861+ Subtarget->hasAtomicFaddInsts() &&
16862+ RMW->getType()->isFloatTy();
16863+
1685616864 // If the return value isn't used, do not introduce a false use in the phi.
1685716865 bool ReturnValueIsUsed = !AI->use_empty();
1685816866
@@ -16874,11 +16882,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1687416882 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
1687516883 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
1687616884
16877- Value *Val = AI->getValOperand();
16878- Type *ValTy = Val->getType();
16879- Value *Addr = AI->getPointerOperand();
16880- Align Alignment = AI->getAlign();
16881-
1688216885 std::prev(BB->end())->eraseFromParent();
1688316886 Builder.SetInsertPoint(BB);
1688416887
@@ -16893,8 +16896,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1689316896
1689416897 Instruction *Clone = AI->clone();
1689516898 Clone->insertInto(SharedBB, SharedBB->end());
16896- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16897- .set(CastToLocal);
16899+ Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
1689816900 LoadedShared = Clone;
1689916901
1690016902 Builder.CreateBr(PhiBB);
@@ -16906,14 +16908,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1690616908 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
1690716909
1690816910 Builder.SetInsertPoint(PrivateBB);
16911+
1690916912 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
1691016913 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
16911- Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
16912- Alignment, "loaded.private");
1691316914
16914- Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
16915+ Value *LoadedPrivate;
16916+ if (RMW) {
16917+ LoadedPrivate = Builder.CreateAlignedLoad(
16918+ RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
16919+
16920+ Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
16921+ LoadedPrivate, RMW->getValOperand());
16922+
16923+ Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
16924+ } else {
16925+ auto [ResultLoad, Equal] =
16926+ buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
16927+ CX->getNewValOperand(), CX->getAlign());
16928+
16929+ Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
16930+ ResultLoad, 0);
16931+ LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
16932+ }
1691516933
16916- Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
1691716934 Builder.CreateBr(PhiBB);
1691816935
1691916936 Builder.SetInsertPoint(GlobalBB);
@@ -16923,8 +16940,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1692316940 if (FullFlatEmulation) {
1692416941 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
1692516942 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16926- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16927- .set(CastToGlobal);
16943+ AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
1692816944 }
1692916945
1693016946 AI->removeFromParent();
@@ -16948,7 +16964,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1694816964 Builder.SetInsertPoint(PhiBB);
1694916965
1695016966 if (ReturnValueIsUsed) {
16951- PHINode *Loaded = Builder.CreatePHI(ValTy , 3);
16967+ PHINode *Loaded = Builder.CreatePHI(AI->getType() , 3);
1695216968 AI->replaceAllUsesWith(Loaded);
1695316969 if (FullFlatEmulation)
1695416970 Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16960,6 +16976,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1696016976 Builder.CreateBr(ExitBB);
1696116977}
1696216978
16979+ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16980+ AtomicRMWInst::BinOp Op = AI->getOperation();
16981+
16982+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16983+ Op == AtomicRMWInst::Xor) {
16984+ if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16985+ ConstVal && ConstVal->isNullValue()) {
16986+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16987+ AI->setOperation(AtomicRMWInst::Add);
16988+
16989+ // We may still need the private-alias-flat handling below.
16990+
16991+ // TODO: Skip this for cases where we cannot access remote memory.
16992+ }
16993+ }
16994+
16995+ // The non-flat expansions should only perform the de-canonicalization of
16996+ // identity values.
16997+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16998+ return;
16999+
17000+ emitExpandAtomicAddrSpacePredicate(AI);
17001+ }
17002+
17003+ void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17004+ emitExpandAtomicAddrSpacePredicate(CI);
17005+ }
17006+
1696317007LoadInst *
1696417008SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
1696517009 IRBuilder<> Builder(AI);
0 commit comments