@@ -16318,15 +16318,14 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
        ++I) {
     auto *Low = mdconst::extract<ConstantInt>(
         NoaliasAddrSpaceMD->getOperand(2 * I + 0));
-    auto *High = mdconst::extract<ConstantInt>(
-        NoaliasAddrSpaceMD->getOperand(2 * I + 1));
-
-    if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
-        High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
-      return true;
+    if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
+      auto *High = mdconst::extract<ConstantInt>(
+          NoaliasAddrSpaceMD->getOperand(2 * I + 1));
+      return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
+    }
   }
 
-  return false;
+  return true;
 }
 
 TargetLowering::AtomicExpansionKind
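
For reference, flatInstrMayAccessPrivate inspects !noalias.addrspace metadata, which holds pairs of ConstantInt operands describing half-open [Low, High) ranges of address spaces the pointer is known not to access; on AMDGPU, private is address space 5. A minimal IR sketch (pointer and value names are illustrative) of a flat atomic the function can prove never touches private:

```llvm
; The metadata pair excludes the half-open range [5, 6), i.e. addrspace(5),
; so this flat atomicrmw is known not to operate on private memory.
%old = atomicrmw add ptr %flat.ptr, i32 1 seq_cst, !noalias.addrspace !0

!0 = !{i32 5, i32 6}
```
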
@@ -16573,9 +16572,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
-  return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
-             : AtomicExpansionKind::None;
+  unsigned AddrSpace = CmpX->getPointerAddressSpace();
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+    return AtomicExpansionKind::NotAtomic;
+
+  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+    return AtomicExpansionKind::None;
+
+  const DataLayout &DL = CmpX->getDataLayout();
+
+  Type *ValTy = CmpX->getNewValOperand()->getType();
+
+  // If a 64-bit flat atomic may alias private, we need to avoid using the
+  // atomic in the private case.
+  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+                                           : AtomicExpansionKind::None;
 }
 
 const TargetRegisterClass *
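
A hedged example of what now takes the Expand path: a 64-bit cmpxchg through a flat pointer that carries no !noalias.addrspace annotation, so it must be assumed to possibly address private memory (names illustrative):

```llvm
; Without metadata ruling out addrspace(5), a 64-bit flat cmpxchg may hit
; private memory, so it is expanded with an explicit address-space test
; rather than emitted as a bare hardware atomic.
%pair = cmpxchg ptr %flat.ptr, i64 %expected, i64 %desired seq_cst seq_cst
%loaded = extractvalue { i64, i1 } %pair, 0
%success = extractvalue { i64, i1 } %pair, 1
```
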
@@ -16741,40 +16752,8 @@ bool SITargetLowering::checkForPhysRegDependency(
   return false;
 }
 
-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
-  AtomicRMWInst::BinOp Op = AI->getOperation();
-
-  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-      Op == AtomicRMWInst::Xor) {
-    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
-        ConstVal && ConstVal->isNullValue()) {
-      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-      AI->setOperation(AtomicRMWInst::Add);
-
-      // TODO: Turn the below private handling into a no-op for idempotent
-      // cases.
-    }
-  }
-
-  // The non-flat expansions should only perform the de-canonicalization of
-  // identity values.
-  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return;
-
-  // FullFlatEmulation is true if we need to issue the private, shared, and
-  // global cases.
-  //
-  // If this is false, we are only dealing with the flat-targeting-private case,
-  // where we only insert a check for private and still use the flat instruction
-  // for global and shared.
-
-  // TODO: Avoid the private check for the fadd case depending on
-  // noalias.addrspace.
-
-  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
-                           Subtarget->hasAtomicFaddInsts() &&
-                           AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+    Instruction *AI) const {
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
   // With this expansion we produce the following code:
@@ -16821,6 +16800,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
 
+  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+                                : AtomicCmpXchgInst::getPointerOperandIndex();
+  Value *Addr = AI->getOperand(PtrOpIdx);
+
+  /// TODO: Only need to check private, then emit flat-known-not private (no
+  /// need for shared block, or cast to global).
+  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+  Align Alignment;
+  if (RMW)
+    Alignment = RMW->getAlign();
+  else if (CX)
+    Alignment = CX->getAlign();
+  else
+    llvm_unreachable("unhandled atomic operation");
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           RMW->getType()->isFloatTy();
+
   // If the return value isn't used, do not introduce a false use in the phi.
   bool ReturnValueIsUsed = !AI->use_empty();
 
@@ -16842,11 +16849,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
 
-  Value *Val = AI->getValOperand();
-  Type *ValTy = Val->getType();
-  Value *Addr = AI->getPointerOperand();
-  Align Alignment = AI->getAlign();
-
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
 
@@ -16861,8 +16863,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   Instruction *Clone = AI->clone();
   Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
+  Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
   LoadedShared = Clone;
 
   Builder.CreateBr(PhiBB);
@@ -16874,14 +16875,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
 
   Builder.SetInsertPoint(PrivateBB);
+
   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
-  Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
-                                                   Alignment, "loaded.private");
 
-  Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+  Value *LoadedPrivate;
+  if (RMW) {
+    LoadedPrivate = Builder.CreateAlignedLoad(
+        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+                                        LoadedPrivate, RMW->getValOperand());
+
+    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+  } else {
+    auto [ResultLoad, Equal] =
+        buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+                          CX->getNewValOperand(), CX->getAlign());
+
+    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+                                              ResultLoad, 0);
+    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+  }
 
-  Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
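
Assuming buildCmpXchgValue emits the usual load/compare/select/store sequence, the private branch of a 64-bit flat cmpxchg should reduce to non-atomic scratch accesses roughly like this (a sketch; value names are illustrative):

```llvm
atomicrmw.private:
  %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %cast.private, align 8
  %success = icmp eq i64 %loaded.private, %expected
  %store.val = select i1 %success, i64 %desired, i64 %loaded.private
  store i64 %store.val, ptr addrspace(5) %cast.private, align 8
  %ins = insertvalue { i64, i1 } poison, i64 %loaded.private, 0
  %result = insertvalue { i64, i1 } %ins, i1 %success, 1
  br label %atomicrmw.phi
```
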
@@ -16891,8 +16907,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (FullFlatEmulation) {
     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToGlobal);
+    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
   }
 
   AI->removeFromParent();
@@ -16916,7 +16931,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
-    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
     AI->replaceAllUsesWith(Loaded);
     if (FullFlatEmulation)
       Loaded->addIncoming(LoadedShared, SharedBB);
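
Putting the pieces together, for the FullFlatEmulation fadd case the expansion should produce control flow of roughly this shape (a sketch assuming the llvm.amdgcn.is.shared/llvm.amdgcn.is.private intrinsics; block names follow the BasicBlock::Create calls above):

```llvm
  %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private

atomicrmw.shared:
  %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
  %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared, float %val seq_cst
  br label %atomicrmw.phi

atomicrmw.check.private:
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:
  %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  %loaded.private = load float, ptr addrspace(5) %cast.private
  %new = fadd float %loaded.private, %val
  store float %new, ptr addrspace(5) %cast.private
  br label %atomicrmw.phi

atomicrmw.global:
  %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
  %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global, float %val seq_cst
  br label %atomicrmw.phi

atomicrmw.phi:
  %loaded = phi float [ %loaded.shared, %atomicrmw.shared ],
                      [ %loaded.private, %atomicrmw.private ],
                      [ %loaded.global, %atomicrmw.global ]
  br label %atomicrmw.end
```
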
@@ -16928,6 +16943,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(ExitBB);
 }
 
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+
+  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+      Op == AtomicRMWInst::Xor) {
+    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // We may still need the private-alias-flat handling below.
+
+      // TODO: Skip this for cases where we cannot access remote memory.
+    }
+  }
+
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  emitExpandAtomicAddrSpacePredicate(CI);
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
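
As a usage note on the de-canonicalization performed by the new emitExpandAtomicRMW wrapper: identity-value sub/or/xor operations are rewritten to the add form before the address-space expansion runs, per the comment above, e.g. (illustrative IR):

```llvm
; atomicrmw or with a zero operand is an identity operation; the wrapper
; rewrites it to the equivalent add form before deciding whether the flat
; private-aliasing expansion is still needed.
%old = atomicrmw or ptr %flat.ptr, i32 0 seq_cst
; ... becomes ...
%old = atomicrmw add ptr %flat.ptr, i32 0 seq_cst
```
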