3939#include "llvm/IR/IntrinsicInst.h"
4040#include "llvm/IR/IntrinsicsAMDGPU.h"
4141#include "llvm/IR/IntrinsicsR600.h"
42+ #include "llvm/IR/MDBuilder.h"
4243#include "llvm/Support/CommandLine.h"
4344#include "llvm/Support/KnownBits.h"
4445#include "llvm/Support/ModRef.h"
@@ -16243,12 +16244,39 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
1624316244 : TargetLowering::AtomicExpansionKind::CmpXChg;
1624416245}
1624516246
16247+ /// Return if a flat address space atomicrmw can access private memory.
16248+ static bool flatInstrMayAccessPrivate(const Instruction *I) {
16249+ const MDNode *NoaliasAddrSpaceMD =
16250+ I->getMetadata(LLVMContext::MD_noalias_addrspace);
16251+ if (!NoaliasAddrSpaceMD)
16252+ return true;
16253+
16254+ // FIXME: Can this actually fail? Why is this optional?
16255+ if (std::optional<ConstantRange> CR =
16256+ getConstantRangeFromMetadata(*NoaliasAddrSpaceMD)) {
16257+ return !CR->contains(APInt(32, AMDGPUAS::PRIVATE_ADDRESS));
16258+ }
16259+
16260+ llvm_unreachable("Why is getConstantRangeFromMetadata optional");
16261+ }
16262+
1624616263TargetLowering::AtomicExpansionKind
1624716264SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1624816265 unsigned AS = RMW->getPointerAddressSpace();
1624916266 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1625016267 return AtomicExpansionKind::NotAtomic;
1625116268
16269+ // 64-bit flat atomics that dynamically reside in private memory will silently
16270+ // be dropped.
16271+ //
16272+ // Note that we will emit a new copy of the original atomic in the expansion,
16273+ // which will be incrementally relegalized.
16274+ const DataLayout &DL = RMW->getFunction()->getDataLayout();
16275+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
16276+ DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16277+ flatInstrMayAccessPrivate(RMW))
16278+ return AtomicExpansionKind::Expand;
16279+
1625216280 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
1625316281 OptimizationRemarkEmitter ORE(RMW->getFunction());
1625416282 ORE.emit([=]() {
@@ -16647,20 +16675,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1664716675
1664816676 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
1664916677 Op == AtomicRMWInst::Xor) {
16650- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16651- assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16652- "this cannot be replaced with add");
16653- AI->setOperation(AtomicRMWInst::Add);
16654- return;
16678+ if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16679+ ConstVal && ConstVal->isNullValue()) {
16680+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16681+ AI->setOperation(AtomicRMWInst::Add);
16682+
16683+ // TODO: Turn the below private handling into a no-op for idempotent
16684+ // cases.
16685+ }
1665516686 }
1665616687
16657- assert(Subtarget->hasAtomicFaddInsts() &&
16658- "target should have atomic fadd instructions");
16659- assert(AI->getType()->isFloatTy() &&
16660- AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16661- "generic atomicrmw expansion only supports FP32 operand in flat "
16662- "address space");
16663- assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16688+ // The non-flat expansions should only perform the de-canonicalization of
16689+ // identity values.
16690+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16691+ return;
16692+
16693+ // FullFlatEmulation is true if we need to issue the private, shared, and
16694+ // global cases.
16695+ //
16696+ // If this is false, we are only dealing with the flat-targeting-private case,
16697+ // where we only insert a check for private and still use the flat instruction
16698+ // for global and shared.
16699+
16700+ // TODO: Avoid the private check for the fadd case depending on
16701+ // noalias.addrspace.
16702+
16703+ bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16704+ Subtarget->hasAtomicFaddInsts() &&
16705+ AI->getType()->isFloatTy();
1666416706
1666516707 // Given: atomicrmw fadd ptr %addr, float %val ordering
1666616708 //
@@ -16700,6 +16742,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1670016742 //
1670116743 // atomicrmw.end:
1670216744 // [...]
16745+ //
16746+ //
16747+ // For 64-bit atomics which may reside in private memory, we perform a simpler
16748+ // version that only inserts the private check, and uses the flat operation.
1670316749
1670416750 IRBuilder<> Builder(AI);
1670516751 LLVMContext &Ctx = Builder.getContext();
@@ -16711,9 +16757,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1671116757 Function *F = BB->getParent();
1671216758 BasicBlock *ExitBB =
1671316759 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16714- BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16715- BasicBlock *CheckPrivateBB =
16716- BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16760+ BasicBlock *SharedBB = nullptr;
16761+
16762+ BasicBlock *CheckPrivateBB = BB;
16763+ if (FullFlatEmulation) {
16764+ SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16765+ CheckPrivateBB =
16766+ BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16767+ }
16768+
1671716769 BasicBlock *PrivateBB =
1671816770 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
1671916771 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16726,23 +16778,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1672616778
1672716779 std::prev(BB->end())->eraseFromParent();
1672816780 Builder.SetInsertPoint(BB);
16729- CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16730- {Addr}, nullptr, "is.shared");
16731- Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
1673216781
16733- Builder.SetInsertPoint(SharedBB);
16734- Value *CastToLocal = Builder.CreateAddrSpaceCast(
16735- Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16782+ Value *LoadedShared = nullptr;
16783+ if (FullFlatEmulation) {
16784+ CallInst *IsShared = Builder.CreateIntrinsic(
16785+ Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
16786+ Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16787+ Builder.SetInsertPoint(SharedBB);
16788+ Value *CastToLocal = Builder.CreateAddrSpaceCast(
16789+ Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
1673616790
16737- Instruction *Clone = AI->clone();
16738- Clone->insertInto(SharedBB, SharedBB->end());
16739- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16740- .set(CastToLocal);
16741- Instruction * LoadedShared = Clone;
16791+ Instruction *Clone = AI->clone();
16792+ Clone->insertInto(SharedBB, SharedBB->end());
16793+ Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16794+ .set(CastToLocal);
16795+ LoadedShared = Clone;
1674216796
16743- Builder.CreateBr(PhiBB);
16797+ Builder.CreateBr(PhiBB);
16798+ Builder.SetInsertPoint(CheckPrivateBB);
16799+ }
1674416800
16745- Builder.SetInsertPoint(CheckPrivateBB);
1674616801 CallInst *IsPrivate = Builder.CreateIntrinsic(
1674716802 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
1674816803 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16759,23 +16814,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1675916814 Builder.CreateBr(PhiBB);
1676016815
1676116816 Builder.SetInsertPoint(GlobalBB);
16762- Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16763- Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16764- Value *LoadedGlobal = AI;
1676516817
16766- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
16818+ // Continue using a flat instruction if we only emitted the check for private.
16819+ Instruction *LoadedGlobal = AI;
16820+ if (FullFlatEmulation) {
16821+ Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16822+ Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16823+ AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16824+ .set(CastToGlobal);
16825+ }
1676716826
1676816827 AI->removeFromParent();
1676916828 AI->insertInto(GlobalBB, GlobalBB->end());
1677016829
16830+ // The new atomicrmw may go through another round of legalization later.
16831+ if (!FullFlatEmulation) {
16832+ // We inserted the runtime check already, make sure we do not try to
16833+ // re-expand this.
16834+ // TODO: Should union with any existing metadata.
16835+ MDBuilder MDB(F->getContext());
16836+ MDNode *RangeNotPrivate =
16837+ MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
16838+ APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
16839+ LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
16840+ RangeNotPrivate);
16841+ }
16842+
1677116843 Builder.CreateBr(PhiBB);
1677216844
1677316845 Builder.SetInsertPoint(PhiBB);
1677416846
1677516847 if (ReturnValueIsUsed) {
1677616848 PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
1677716849 AI->replaceAllUsesWith(Loaded);
16778- Loaded->addIncoming(LoadedShared, SharedBB);
16850+ if (FullFlatEmulation)
16851+ Loaded->addIncoming(LoadedShared, SharedBB);
1677916852 Loaded->addIncoming(LoadedPrivate, PrivateBB);
1678016853 Loaded->addIncoming(LoadedGlobal, GlobalBB);
1678116854 Loaded->takeName(AI);
0 commit comments