3939#include "llvm/IR/IntrinsicInst.h"
4040#include "llvm/IR/IntrinsicsAMDGPU.h"
4141#include "llvm/IR/IntrinsicsR600.h"
42+ #include "llvm/IR/MDBuilder.h"
4243#include "llvm/Support/CommandLine.h"
4344#include "llvm/Support/KnownBits.h"
4445#include "llvm/Support/ModRef.h"
@@ -16327,12 +16328,39 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
1632716328 : TargetLowering::AtomicExpansionKind::CmpXChg;
1632816329}
1632916330
16331+ /// Return if a flat address space atomicrmw can access private memory.
16332+ static bool flatInstrMayAccessPrivate(const Instruction *I) {
16333+ const MDNode *NoaliasAddrSpaceMD =
16334+ I->getMetadata(LLVMContext::MD_noalias_addrspace);
16335+ if (!NoaliasAddrSpaceMD)
16336+ return true;
16337+
16338+ // FIXME: Can this actually fail? Why is this optional?
16339+ if (std::optional<ConstantRange> CR =
16340+ getConstantRangeFromMetadata(*NoaliasAddrSpaceMD)) {
16341+ return !CR->contains(APInt(32, AMDGPUAS::PRIVATE_ADDRESS));
16342+ }
16343+
16344+ llvm_unreachable("Why is getConstantRangeFromMetadata optional");
16345+ }
16346+
1633016347TargetLowering::AtomicExpansionKind
1633116348SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1633216349 unsigned AS = RMW->getPointerAddressSpace();
1633316350 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1633416351 return AtomicExpansionKind::NotAtomic;
1633516352
16353+ // 64-bit flat atomics that dynamically reside in private memory will silently
16354+ // be dropped.
16355+ //
16356+ // Note that we will emit a new copy of the original atomic in the expansion,
16357+ // which will be incrementally relegalized.
16358+ const DataLayout &DL = RMW->getFunction()->getDataLayout();
16359+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
16360+ DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16361+ flatInstrMayAccessPrivate(RMW))
16362+ return AtomicExpansionKind::Expand;
16363+
1633616364 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
1633716365 OptimizationRemarkEmitter ORE(RMW->getFunction());
1633816366 ORE.emit([=]() {
@@ -16731,20 +16759,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1673116759
1673216760 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
1673316761 Op == AtomicRMWInst::Xor) {
16734- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16735- assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16736- "this cannot be replaced with add");
16737- AI->setOperation(AtomicRMWInst::Add);
16738- return;
16762+ if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16763+ ConstVal && ConstVal->isNullValue()) {
16764+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16765+ AI->setOperation(AtomicRMWInst::Add);
16766+
16767+ // TODO: Turn the below private handling into a no-op for idempotent
16768+ // cases.
16769+ }
1673916770 }
1674016771
16741- assert(Subtarget->hasAtomicFaddInsts() &&
16742- "target should have atomic fadd instructions");
16743- assert(AI->getType()->isFloatTy() &&
16744- AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16745- "generic atomicrmw expansion only supports FP32 operand in flat "
16746- "address space");
16747- assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16772+ // The non-flat expansions should only perform the de-canonicalization of
16773+ // identity values.
16774+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16775+ return;
16776+
16777+ // FullFlatEmulation is true if we need to issue the private, shared, and
16778+ // global cases.
16779+ //
16780+ // If this is false, we are only dealing with the flat-targeting-private case,
16781+ // where we only insert a check for private and still use the flat instruction
16782+ // for global and shared.
16783+
16784+ // TODO: Avoid the private check for the fadd case depending on
16785+ // noalias.addrspace.
16786+
16787+ bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16788+ Subtarget->hasAtomicFaddInsts() &&
16789+ AI->getType()->isFloatTy();
1674816790
1674916791 // Given: atomicrmw fadd ptr %addr, float %val ordering
1675016792 //
@@ -16784,6 +16826,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1678416826 //
1678516827 // atomicrmw.end:
1678616828 // [...]
16829+ //
16830+ //
16831+ // For 64-bit atomics which may reside in private memory, we perform a simpler
16832+ // version that only inserts the private check, and uses the flat operation.
1678716833
1678816834 IRBuilder<> Builder(AI);
1678916835 LLVMContext &Ctx = Builder.getContext();
@@ -16795,9 +16841,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1679516841 Function *F = BB->getParent();
1679616842 BasicBlock *ExitBB =
1679716843 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16798- BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16799- BasicBlock *CheckPrivateBB =
16800- BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16844+ BasicBlock *SharedBB = nullptr;
16845+
16846+ BasicBlock *CheckPrivateBB = BB;
16847+ if (FullFlatEmulation) {
16848+ SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16849+ CheckPrivateBB =
16850+ BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16851+ }
16852+
1680116853 BasicBlock *PrivateBB =
1680216854 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
1680316855 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16810,23 +16862,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1681016862
1681116863 std::prev(BB->end())->eraseFromParent();
1681216864 Builder.SetInsertPoint(BB);
16813- CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16814- {Addr}, nullptr, "is.shared");
16815- Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
1681616865
16817- Builder.SetInsertPoint(SharedBB);
16818- Value *CastToLocal = Builder.CreateAddrSpaceCast(
16819- Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16866+ Value *LoadedShared = nullptr;
16867+ if (FullFlatEmulation) {
16868+ CallInst *IsShared = Builder.CreateIntrinsic(
16869+ Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
16870+ Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16871+ Builder.SetInsertPoint(SharedBB);
16872+ Value *CastToLocal = Builder.CreateAddrSpaceCast(
16873+ Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
1682016874
16821- Instruction *Clone = AI->clone();
16822- Clone->insertInto(SharedBB, SharedBB->end());
16823- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16824- .set(CastToLocal);
16825- Instruction * LoadedShared = Clone;
16875+ Instruction *Clone = AI->clone();
16876+ Clone->insertInto(SharedBB, SharedBB->end());
16877+ Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16878+ .set(CastToLocal);
16879+ LoadedShared = Clone;
1682616880
16827- Builder.CreateBr(PhiBB);
16881+ Builder.CreateBr(PhiBB);
16882+ Builder.SetInsertPoint(CheckPrivateBB);
16883+ }
1682816884
16829- Builder.SetInsertPoint(CheckPrivateBB);
1683016885 CallInst *IsPrivate = Builder.CreateIntrinsic(
1683116886 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
1683216887 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16843,23 +16898,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1684316898 Builder.CreateBr(PhiBB);
1684416899
1684516900 Builder.SetInsertPoint(GlobalBB);
16846- Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16847- Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16848- Value *LoadedGlobal = AI;
1684916901
16850- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
16902+ // Continue using a flat instruction if we only emitted the check for private.
16903+ Instruction *LoadedGlobal = AI;
16904+ if (FullFlatEmulation) {
16905+ Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16906+ Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16907+ AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16908+ .set(CastToGlobal);
16909+ }
1685116910
1685216911 AI->removeFromParent();
1685316912 AI->insertInto(GlobalBB, GlobalBB->end());
1685416913
16914+ // The new atomicrmw may go through another round of legalization later.
16915+ if (!FullFlatEmulation) {
16916+ // We inserted the runtime check already, make sure we do not try to
16917+ // re-expand this.
16918+ // TODO: Should union with any existing metadata.
16919+ MDBuilder MDB(F->getContext());
16920+ MDNode *RangeNotPrivate =
16921+ MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
16922+ APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
16923+ LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
16924+ RangeNotPrivate);
16925+ }
16926+
1685516927 Builder.CreateBr(PhiBB);
1685616928
1685716929 Builder.SetInsertPoint(PhiBB);
1685816930
1685916931 if (ReturnValueIsUsed) {
1686016932 PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
1686116933 AI->replaceAllUsesWith(Loaded);
16862- Loaded->addIncoming(LoadedShared, SharedBB);
16934+ if (FullFlatEmulation)
16935+ Loaded->addIncoming(LoadedShared, SharedBB);
1686316936 Loaded->addIncoming(LoadedPrivate, PrivateBB);
1686416937 Loaded->addIncoming(LoadedGlobal, GlobalBB);
1686516938 Loaded->takeName(AI);
0 commit comments