3939#include "llvm/IR/IntrinsicInst.h"
4040#include "llvm/IR/IntrinsicsAMDGPU.h"
4141#include "llvm/IR/IntrinsicsR600.h"
42+ #include "llvm/IR/MDBuilder.h"
4243#include "llvm/Support/CommandLine.h"
4344#include "llvm/Support/KnownBits.h"
4445#include "llvm/Support/ModRef.h"
@@ -16236,12 +16237,39 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
1623616237 : TargetLowering::AtomicExpansionKind::CmpXChg;
1623716238}
1623816239
16240+ /// Return if a flat address space atomicrmw can access private memory.
16241+ static bool flatInstrMayAccessPrivate(const Instruction *I) {
16242+ const MDNode *NoaliasAddrSpaceMD =
16243+ I->getMetadata(LLVMContext::MD_noalias_addrspace);
16244+ if (!NoaliasAddrSpaceMD)
16245+ return true;
16246+
16247+ // FIXME: Can this actually fail? Why is this optional?
16248+ if (std::optional<ConstantRange> CR =
16249+ getConstantRangeFromMetadata(*NoaliasAddrSpaceMD)) {
16250+ return !CR->contains(APInt(32, AMDGPUAS::PRIVATE_ADDRESS));
16251+ }
16252+
16253+ llvm_unreachable("Why is getConstantRangeFromMetadata optional");
16254+ }
16255+
1623916256TargetLowering::AtomicExpansionKind
1624016257SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1624116258 unsigned AS = RMW->getPointerAddressSpace();
1624216259 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1624316260 return AtomicExpansionKind::NotAtomic;
1624416261
16262+ // 64-bit flat atomics that dynamically reside in private memory will silently
16263+ // be dropped.
16264+ //
16265+ // Note that we will emit a new copy of the original atomic in the expansion,
16266+ // which will be incrementally relegalized.
16267+ const DataLayout &DL = RMW->getFunction()->getDataLayout();
16268+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
16269+ DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16270+ flatInstrMayAccessPrivate(RMW))
16271+ return AtomicExpansionKind::Expand;
16272+
1624516273 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
1624616274 OptimizationRemarkEmitter ORE(RMW->getFunction());
1624716275 ORE.emit([=]() {
@@ -16640,20 +16668,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1664016668
1664116669 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
1664216670 Op == AtomicRMWInst::Xor) {
16643- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16644- assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16645- "this cannot be replaced with add");
16646- AI->setOperation(AtomicRMWInst::Add);
16647- return;
16671+ if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16672+ ConstVal && ConstVal->isNullValue()) {
16673+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16674+ AI->setOperation(AtomicRMWInst::Add);
16675+
16676+ // TODO: Turn the below private handling into a no-op for idempotent
16677+ // cases.
16678+ }
1664816679 }
1664916680
16650- assert(Subtarget->hasAtomicFaddInsts() &&
16651- "target should have atomic fadd instructions");
16652- assert(AI->getType()->isFloatTy() &&
16653- AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16654- "generic atomicrmw expansion only supports FP32 operand in flat "
16655- "address space");
16656- assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16681+ // The non-flat expansions should only perform the de-canonicalization of
16682+ // identity values.
16683+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16684+ return;
16685+
16686+ // FullFlatEmulation is true if we need to issue the private, shared, and
16687+ // global cases.
16688+ //
16689+ // If this is false, we are only dealing with the flat-targeting-private case,
16690+ // where we only insert a check for private and still use the flat instruction
16691+ // for global and shared.
16692+
16693+ // TODO: Avoid the private check for the fadd case depending on
16694+ // noalias.addrspace.
16695+
16696+ bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16697+ Subtarget->hasAtomicFaddInsts() &&
16698+ AI->getType()->isFloatTy();
1665716699
1665816700 // Given: atomicrmw fadd ptr %addr, float %val ordering
1665916701 //
@@ -16693,6 +16735,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1669316735 //
1669416736 // atomicrmw.end:
1669516737 // [...]
16738+ //
16739+ //
16740+ // For 64-bit atomics which may reside in private memory, we perform a simpler
16741+ // version that only inserts the private check, and uses the flat operation.
1669616742
1669716743 IRBuilder<> Builder(AI);
1669816744 LLVMContext &Ctx = Builder.getContext();
@@ -16704,9 +16750,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1670416750 Function *F = BB->getParent();
1670516751 BasicBlock *ExitBB =
1670616752 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16707- BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16708- BasicBlock *CheckPrivateBB =
16709- BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16753+ BasicBlock *SharedBB = nullptr;
16754+
16755+ BasicBlock *CheckPrivateBB = BB;
16756+ if (FullFlatEmulation) {
16757+ SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16758+ CheckPrivateBB =
16759+ BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16760+ }
16761+
1671016762 BasicBlock *PrivateBB =
1671116763 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
1671216764 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16719,23 +16771,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1671916771
1672016772 std::prev(BB->end())->eraseFromParent();
1672116773 Builder.SetInsertPoint(BB);
16722- CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16723- {Addr}, nullptr, "is.shared");
16724- Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
1672516774
16726- Builder.SetInsertPoint(SharedBB);
16727- Value *CastToLocal = Builder.CreateAddrSpaceCast(
16728- Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16775+ Value *LoadedShared = nullptr;
16776+ if (FullFlatEmulation) {
16777+ CallInst *IsShared = Builder.CreateIntrinsic(
16778+ Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
16779+ Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16780+ Builder.SetInsertPoint(SharedBB);
16781+ Value *CastToLocal = Builder.CreateAddrSpaceCast(
16782+ Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
1672916783
16730- Instruction *Clone = AI->clone();
16731- Clone->insertInto(SharedBB, SharedBB->end());
16732- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16733- .set(CastToLocal);
16734- Instruction * LoadedShared = Clone;
16784+ Instruction *Clone = AI->clone();
16785+ Clone->insertInto(SharedBB, SharedBB->end());
16786+ Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16787+ .set(CastToLocal);
16788+ LoadedShared = Clone;
1673516789
16736- Builder.CreateBr(PhiBB);
16790+ Builder.CreateBr(PhiBB);
16791+ Builder.SetInsertPoint(CheckPrivateBB);
16792+ }
1673716793
16738- Builder.SetInsertPoint(CheckPrivateBB);
1673916794 CallInst *IsPrivate = Builder.CreateIntrinsic(
1674016795 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
1674116796 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16752,23 +16807,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1675216807 Builder.CreateBr(PhiBB);
1675316808
1675416809 Builder.SetInsertPoint(GlobalBB);
16755- Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16756- Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16757- Value *LoadedGlobal = AI;
1675816810
16759- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
16811+ // Continue using a flat instruction if we only emitted the check for private.
16812+ Instruction *LoadedGlobal = AI;
16813+ if (FullFlatEmulation) {
16814+ Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16815+ Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16816+ AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16817+ .set(CastToGlobal);
16818+ }
1676016819
1676116820 AI->removeFromParent();
1676216821 AI->insertInto(GlobalBB, GlobalBB->end());
1676316822
16823+ // The new atomicrmw may go through another round of legalization later.
16824+ if (!FullFlatEmulation) {
16825+ // We inserted the runtime check already, make sure we do not try to
16826+ // re-expand this.
16827+ // TODO: Should union with any existing metadata.
16828+ MDBuilder MDB(F->getContext());
16829+ MDNode *RangeNotPrivate =
16830+ MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
16831+ APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
16832+ LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
16833+ RangeNotPrivate);
16834+ }
16835+
1676416836 Builder.CreateBr(PhiBB);
1676516837
1676616838 Builder.SetInsertPoint(PhiBB);
1676716839
1676816840 if (ReturnValueIsUsed) {
1676916841 PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
1677016842 AI->replaceAllUsesWith(Loaded);
16771- Loaded->addIncoming(LoadedShared, SharedBB);
16843+ if (FullFlatEmulation)
16844+ Loaded->addIncoming(LoadedShared, SharedBB);
1677216845 Loaded->addIncoming(LoadedPrivate, PrivateBB);
1677316846 Loaded->addIncoming(LoadedGlobal, GlobalBB);
1677416847 Loaded->takeName(AI);
0 commit comments