@@ -16577,9 +16577,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
1657716577
// Chooses the IR-level expansion kind for a cmpxchg based on the address space
// of its pointer operand.
//
// This span is a rendered diff: lines marked '-' are the previous body and
// lines marked '+' are its replacement.
//
// Replacement behavior:
//   * private address space                      -> NotAtomic
//   * not flat, or flat for which
//     flatInstrMayAccessPrivate(CmpX) is false   -> None
//   * otherwise, new-value type is 64 bits wide
//     according to the DataLayout                -> Expand
//   * otherwise                                  -> None
1657816578TargetLowering::AtomicExpansionKind
1657916579SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16580- return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16581- ? AtomicExpansionKind::NotAtomic
16582- : AtomicExpansionKind::None;
16580+ unsigned AddrSpace = CmpX->getPointerAddressSpace();
16581+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16582+ return AtomicExpansionKind::NotAtomic;
16583+
16584+ if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16585+ return AtomicExpansionKind::None;
16586+
16587+ const DataLayout &DL = CmpX->getDataLayout();
16588+
16589+ Type *ValTy = CmpX->getNewValOperand()->getType();
16590+
16591+ // If a 64-bit flat atomic may alias private, we need to avoid using the
16592+ // atomic in the private case.
16593+ return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16594+ : AtomicExpansionKind::None;
1658316595}
1658416596
1658516597const TargetRegisterClass *
@@ -16745,40 +16757,8 @@ bool SITargetLowering::checkForPhysRegDependency(
1674516757 return false;
1674616758}
1674716759
16748- void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16749- AtomicRMWInst::BinOp Op = AI->getOperation();
16750-
16751- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16752- Op == AtomicRMWInst::Xor) {
16753- if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16754- ConstVal && ConstVal->isNullValue()) {
16755- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16756- AI->setOperation(AtomicRMWInst::Add);
16757-
16758- // TODO: Turn the below private handling into a no-op for idempotent
16759- // cases.
16760- }
16761- }
16762-
16763- // The non-flat expansions should only perform the de-canonicalization of
16764- // identity values.
16765- if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16766- return;
16767-
16768- // FullFlatEmulation is true if we need to issue the private, shared, and
16769- // global cases.
16770- //
16771- // If this is false, we are only dealing with the flat-targeting-private case,
16772- // where we only insert a check for private and still use the flat instruction
16773- // for global and shared.
16774-
16775- // TODO: Avoid the private check for the fadd case depending on
16776- // noalias.addrspace.
16777-
16778- bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16779- Subtarget->hasAtomicFaddInsts() &&
16780- AI->getType()->isFloatTy();
16781-
16760+ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
16761+ Instruction *AI) const {
1678216762 // Given: atomicrmw fadd ptr %addr, float %val ordering
1678316763 //
1678416764 // With this expansion we produce the following code:
@@ -16825,6 +16805,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1682516805 IRBuilder<> Builder(AI);
1682616806 LLVMContext &Ctx = Builder.getContext();
1682716807
16808+ auto *RMW = dyn_cast<AtomicRMWInst>(AI);
16809+ const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
16810+ : AtomicCmpXchgInst::getPointerOperandIndex();
16811+ Value *Addr = AI->getOperand(PtrOpIdx);
16812+
16813+ /// TODO: Only need to check private, then emit flat-known-not private (no
16814+ /// need for shared block, or cast to global).
16815+ AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
16816+
16817+ Align Alignment;
16818+ if (RMW)
16819+ Alignment = RMW->getAlign();
16820+ else if (CX)
16821+ Alignment = CX->getAlign();
16822+ else
16823+ llvm_unreachable("unhandled atomic operation");
16824+
16825+ // FullFlatEmulation is true if we need to issue the private, shared, and
16826+ // global cases.
16827+ //
16828+ // If this is false, we are only dealing with the flat-targeting-private case,
16829+ // where we only insert a check for private and still use the flat instruction
16830+ // for global and shared.
16831+
16832+ bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
16833+ Subtarget->hasAtomicFaddInsts() &&
16834+ RMW->getType()->isFloatTy();
16835+
1682816836 // If the return value isn't used, do not introduce a false use in the phi.
1682916837 bool ReturnValueIsUsed = !AI->use_empty();
1683016838
@@ -16846,11 +16854,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1684616854 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
1684716855 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
1684816856
16849- Value *Val = AI->getValOperand();
16850- Type *ValTy = Val->getType();
16851- Value *Addr = AI->getPointerOperand();
16852- Align Alignment = AI->getAlign();
16853-
1685416857 std::prev(BB->end())->eraseFromParent();
1685516858 Builder.SetInsertPoint(BB);
1685616859
@@ -16865,8 +16868,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1686516868
1686616869 Instruction *Clone = AI->clone();
1686716870 Clone->insertInto(SharedBB, SharedBB->end());
16868- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16869- .set(CastToLocal);
16871+ Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
1687016872 LoadedShared = Clone;
1687116873
1687216874 Builder.CreateBr(PhiBB);
@@ -16878,14 +16880,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1687816880 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
1687916881
1688016882 Builder.SetInsertPoint(PrivateBB);
16883+
1688116884 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
1688216885 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
16883- Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
16884- Alignment, "loaded.private");
1688516886
16886- Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
16887+ Value *LoadedPrivate;
16888+ if (RMW) {
16889+ LoadedPrivate = Builder.CreateAlignedLoad(
16890+ RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
16891+
16892+ Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
16893+ LoadedPrivate, RMW->getValOperand());
16894+
16895+ Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
16896+ } else {
16897+ auto [ResultLoad, Equal] =
16898+ buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
16899+ CX->getNewValOperand(), CX->getAlign());
16900+
16901+ Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
16902+ ResultLoad, 0);
16903+ LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
16904+ }
1688716905
16888- Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
1688916906 Builder.CreateBr(PhiBB);
1689016907
1689116908 Builder.SetInsertPoint(GlobalBB);
@@ -16895,8 +16912,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1689516912 if (FullFlatEmulation) {
1689616913 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
1689716914 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16898- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16899- .set(CastToGlobal);
16915+ AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
1690016916 }
1690116917
1690216918 AI->removeFromParent();
@@ -16920,7 +16936,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1692016936 Builder.SetInsertPoint(PhiBB);
1692116937
1692216938 if (ReturnValueIsUsed) {
16923- PHINode *Loaded = Builder.CreatePHI(ValTy , 3);
16939+ PHINode *Loaded = Builder.CreatePHI(AI->getType() , 3);
1692416940 AI->replaceAllUsesWith(Loaded);
1692516941 if (FullFlatEmulation)
1692616942 Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16932,6 +16948,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1693216948 Builder.CreateBr(ExitBB);
1693316949}
1693416950
// Expands an atomicrmw instruction in two steps:
//   1. If the operation is sub, or, or xor and the value operand is a constant
//      null value, the operation is rewritten in place to add.
//   2. If the pointer is in the flat address space, the instruction is handed
//      to emitExpandAtomicAddrSpacePredicate. For any other address space the
//      function returns after step 1 without further changes.
16951+ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16952+ AtomicRMWInst::BinOp Op = AI->getOperation();
16953+
16954+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16955+ Op == AtomicRMWInst::Xor) {
16956+ if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16957+ ConstVal && ConstVal->isNullValue()) {
16958+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16959+ AI->setOperation(AtomicRMWInst::Add);
16960+
16961+ // We may still need the private-alias-flat handling below.
16962+
16963+ // TODO: Skip this for cases where we cannot access remote memory.
16964+ }
16965+ }
16966+
16967+ // The non-flat expansions should only perform the de-canonicalization of
16968+ // identity values.
16969+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16970+ return;
16971+
16972+ emitExpandAtomicAddrSpacePredicate(AI);
16973+ }
16974+
// Expands a cmpxchg instruction by forwarding it unchanged to
// emitExpandAtomicAddrSpacePredicate. No address-space check is performed
// here.
// NOTE(review): presumably this is only reached when
// shouldExpandAtomicCmpXchgInIR returned Expand, i.e. for flat pointers --
// confirm against the caller.
16975+ void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
16976+ emitExpandAtomicAddrSpacePredicate(CI);
16977+ }
16978+
1693516979LoadInst *
1693616980SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
1693716981 IRBuilder<> Builder(AI);
0 commit comments