@@ -16497,9 +16497,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
1649716497
1649816498TargetLowering::AtomicExpansionKind
1649916499SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16500- return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16501- ? AtomicExpansionKind::NotAtomic
16502- : AtomicExpansionKind::None;
16500+ unsigned AddrSpace = CmpX->getPointerAddressSpace();
16501+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16502+ return AtomicExpansionKind::NotAtomic;
16503+
16504+ if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16505+ return AtomicExpansionKind::None;
16506+
16507+ const DataLayout &DL = CmpX->getDataLayout();
16508+
16509+ Type *ValTy = CmpX->getNewValOperand()->getType();
16510+
16511+ // If a 64-bit flat atomic may alias private, we need to avoid using the
16512+ // atomic in the private case.
16513+ return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16514+ : AtomicExpansionKind::None;
1650316515}
1650416516
1650516517const TargetRegisterClass *
@@ -16663,40 +16675,8 @@ bool SITargetLowering::checkForPhysRegDependency(
1666316675 return false;
1666416676}
1666516677
16666- void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16667- AtomicRMWInst::BinOp Op = AI->getOperation();
16668-
16669- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16670- Op == AtomicRMWInst::Xor) {
16671- if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16672- ConstVal && ConstVal->isNullValue()) {
16673- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16674- AI->setOperation(AtomicRMWInst::Add);
16675-
16676- // TODO: Turn the below private handling into a no-op for idempotent
16677- // cases.
16678- }
16679- }
16680-
16681- // The non-flat expansions should only perform the de-canonicalization of
16682- // identity values.
16683- if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16684- return;
16685-
16686- // FullFlatEmulation is true if we need to issue the private, shared, and
16687- // global cases.
16688- //
16689- // If this is false, we are only dealing with the flat-targeting-private case,
16690- // where we only insert a check for private and still use the flat instruction
16691- // for global and shared.
16692-
16693- // TODO: Avoid the private check for the fadd case depending on
16694- // noalias.addrspace.
16695-
16696- bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16697- Subtarget->hasAtomicFaddInsts() &&
16698- AI->getType()->isFloatTy();
16699-
16678+ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
16679+ Instruction *AI) const {
1670016680 // Given: atomicrmw fadd ptr %addr, float %val ordering
1670116681 //
1670216682 // With this expansion we produce the following code:
@@ -16743,6 +16723,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1674316723 IRBuilder<> Builder(AI);
1674416724 LLVMContext &Ctx = Builder.getContext();
1674516725
16726+ auto *RMW = dyn_cast<AtomicRMWInst>(AI);
16727+ const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
16728+ : AtomicCmpXchgInst::getPointerOperandIndex();
16729+ Value *Addr = AI->getOperand(PtrOpIdx);
16730+
16731+ /// TODO: Only need to check private, then emit flat-known-not private (no
16732+ /// need for shared block, or cast to global).
16733+ AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
16734+
16735+ Align Alignment;
16736+ if (RMW)
16737+ Alignment = RMW->getAlign();
16738+ else if (CX)
16739+ Alignment = CX->getAlign();
16740+ else
16741+ llvm_unreachable("unhandled atomic operation");
16742+
16743+ // FullFlatEmulation is true if we need to issue the private, shared, and
16744+ // global cases.
16745+ //
16746+ // If this is false, we are only dealing with the flat-targeting-private case,
16747+ // where we only insert a check for private and still use the flat instruction
16748+ // for global and shared.
16749+
16750+ bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
16751+ Subtarget->hasAtomicFaddInsts() &&
16752+ RMW->getType()->isFloatTy();
16753+
1674616754 // If the return value isn't used, do not introduce a false use in the phi.
1674716755 bool ReturnValueIsUsed = !AI->use_empty();
1674816756
@@ -16764,11 +16772,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1676416772 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
1676516773 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
1676616774
16767- Value *Val = AI->getValOperand();
16768- Type *ValTy = Val->getType();
16769- Value *Addr = AI->getPointerOperand();
16770- Align Alignment = AI->getAlign();
16771-
1677216775 std::prev(BB->end())->eraseFromParent();
1677316776 Builder.SetInsertPoint(BB);
1677416777
@@ -16783,8 +16786,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1678316786
1678416787 Instruction *Clone = AI->clone();
1678516788 Clone->insertInto(SharedBB, SharedBB->end());
16786- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16787- .set(CastToLocal);
16789+ Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
1678816790 LoadedShared = Clone;
1678916791
1679016792 Builder.CreateBr(PhiBB);
@@ -16796,14 +16798,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1679616798 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
1679716799
1679816800 Builder.SetInsertPoint(PrivateBB);
16801+
1679916802 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
1680016803 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
16801- Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
16802- Alignment, "loaded.private");
1680316804
16804- Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
16805+ Value *LoadedPrivate;
16806+ if (RMW) {
16807+ LoadedPrivate = Builder.CreateAlignedLoad(
16808+ RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
16809+
16810+ Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
16811+ LoadedPrivate, RMW->getValOperand());
16812+
16813+ Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
16814+ } else {
16815+ auto [ResultLoad, Equal] =
16816+ buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
16817+ CX->getNewValOperand(), CX->getAlign());
16818+
16819+ Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
16820+ ResultLoad, 0);
16821+ LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
16822+ }
1680516823
16806- Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
1680716824 Builder.CreateBr(PhiBB);
1680816825
1680916826 Builder.SetInsertPoint(GlobalBB);
@@ -16813,8 +16830,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1681316830 if (FullFlatEmulation) {
1681416831 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
1681516832 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16816- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16817- .set(CastToGlobal);
16833+ AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
1681816834 }
1681916835
1682016836 AI->removeFromParent();
@@ -16838,7 +16854,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1683816854 Builder.SetInsertPoint(PhiBB);
1683916855
1684016856 if (ReturnValueIsUsed) {
16841- PHINode *Loaded = Builder.CreatePHI(ValTy , 3);
16857+ PHINode *Loaded = Builder.CreatePHI(AI->getType() , 3);
1684216858 AI->replaceAllUsesWith(Loaded);
1684316859 if (FullFlatEmulation)
1684416860 Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16850,6 +16866,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1685016866 Builder.CreateBr(ExitBB);
1685116867}
1685216868
16869+ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16870+ AtomicRMWInst::BinOp Op = AI->getOperation();
16871+
16872+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16873+ Op == AtomicRMWInst::Xor) {
16874+ if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16875+ ConstVal && ConstVal->isNullValue()) {
16876+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16877+ AI->setOperation(AtomicRMWInst::Add);
16878+
16879+ // We may still need the private-alias-flat handling below.
16880+
16881+ // TODO: Skip this for cases where we cannot access remote memory.
16882+ }
16883+ }
16884+
16885+ // The non-flat expansions should only perform the de-canonicalization of
16886+ // identity values.
16887+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16888+ return;
16889+
16890+ emitExpandAtomicAddrSpacePredicate(AI);
16891+ }
16892+
16893+ void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
16894+ emitExpandAtomicAddrSpacePredicate(CI);
16895+ }
16896+
1685316897LoadInst *
1685416898SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
1685516899 IRBuilder<> Builder(AI);
0 commit comments