3939#include "llvm/IR/IntrinsicInst.h"
4040#include "llvm/IR/IntrinsicsAMDGPU.h"
4141#include "llvm/IR/IntrinsicsR600.h"
42+ #include "llvm/IR/MDBuilder.h"
4243#include "llvm/Support/CommandLine.h"
4344#include "llvm/Support/KnownBits.h"
4445#include "llvm/Support/ModRef.h"
@@ -16308,12 +16309,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
1630816309 : TargetLowering::AtomicExpansionKind::CmpXChg;
1630916310}
1631016311
16312+ /// Return if a flat address space atomicrmw can access private memory.
16313+ static bool flatInstrMayAccessPrivate(const Instruction *I) {
16314+ const MDNode *NoaliasAddrSpaceMD =
16315+ I->getMetadata(LLVMContext::MD_noalias_addrspace);
16316+ if (!NoaliasAddrSpaceMD)
16317+ return true;
16318+
16319+ for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16320+ ++I) {
16321+ auto *Low = mdconst::extract<ConstantInt>(
16322+ NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16323+ auto *High = mdconst::extract<ConstantInt>(
16324+ NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16325+
16326+ if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
16327+ High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
16328+ return true;
16329+ }
16330+
16331+ return false;
16332+ }
16333+
1631116334TargetLowering::AtomicExpansionKind
1631216335SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1631316336 unsigned AS = RMW->getPointerAddressSpace();
1631416337 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1631516338 return AtomicExpansionKind::NotAtomic;
1631616339
16340+ // 64-bit flat atomics that dynamically reside in private memory will silently
16341+ // be dropped.
16342+ //
16343+ // Note that we will emit a new copy of the original atomic in the expansion,
16344+ // which will be incrementally relegalized.
16345+ const DataLayout &DL = RMW->getFunction()->getDataLayout();
16346+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
16347+ DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16348+ flatInstrMayAccessPrivate(RMW))
16349+ return AtomicExpansionKind::Expand;
16350+
1631716351 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
1631816352 OptimizationRemarkEmitter ORE(RMW->getFunction());
1631916353 ORE.emit([=]() {
@@ -16714,20 +16748,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1671416748
1671516749 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
1671616750 Op == AtomicRMWInst::Xor) {
16717- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16718- assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16719- "this cannot be replaced with add");
16720- AI->setOperation(AtomicRMWInst::Add);
16721- return;
16751+ if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16752+ ConstVal && ConstVal->isNullValue()) {
16753+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16754+ AI->setOperation(AtomicRMWInst::Add);
16755+
16756+ // TODO: Turn the below private handling into a no-op for idempotent
16757+ // cases.
16758+ }
1672216759 }
1672316760
16724- assert(Subtarget->hasAtomicFaddInsts() &&
16725- "target should have atomic fadd instructions");
16726- assert(AI->getType()->isFloatTy() &&
16727- AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16728- "generic atomicrmw expansion only supports FP32 operand in flat "
16729- "address space");
16730- assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16761+ // The non-flat expansions should only perform the de-canonicalization of
16762+ // identity values.
16763+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16764+ return;
16765+
16766+ // FullFlatEmulation is true if we need to issue the private, shared, and
16767+ // global cases.
16768+ //
16769+ // If this is false, we are only dealing with the flat-targeting-private case,
16770+ // where we only insert a check for private and still use the flat instruction
16771+ // for global and shared.
16772+
16773+ // TODO: Avoid the private check for the fadd case depending on
16774+ // noalias.addrspace.
16775+
16776+ bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16777+ Subtarget->hasAtomicFaddInsts() &&
16778+ AI->getType()->isFloatTy();
1673116779
1673216780 // Given: atomicrmw fadd ptr %addr, float %val ordering
1673316781 //
@@ -16767,6 +16815,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1676716815 //
1676816816 // atomicrmw.end:
1676916817 // [...]
16818+ //
16819+ //
16820+ // For 64-bit atomics which may reside in private memory, we perform a simpler
16821+ // version that only inserts the private check, and uses the flat operation.
1677016822
1677116823 IRBuilder<> Builder(AI);
1677216824 LLVMContext &Ctx = Builder.getContext();
@@ -16778,9 +16830,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1677816830 Function *F = BB->getParent();
1677916831 BasicBlock *ExitBB =
1678016832 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16781- BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16782- BasicBlock *CheckPrivateBB =
16783- BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16833+ BasicBlock *SharedBB = nullptr;
16834+
16835+ BasicBlock *CheckPrivateBB = BB;
16836+ if (FullFlatEmulation) {
16837+ SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16838+ CheckPrivateBB =
16839+ BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16840+ }
16841+
1678416842 BasicBlock *PrivateBB =
1678516843 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
1678616844 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16793,23 +16851,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1679316851
1679416852 std::prev(BB->end())->eraseFromParent();
1679516853 Builder.SetInsertPoint(BB);
16796- CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16797- {Addr}, nullptr, "is.shared");
16798- Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
1679916854
16800- Builder.SetInsertPoint(SharedBB);
16801- Value *CastToLocal = Builder.CreateAddrSpaceCast(
16802- Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16855+ Value *LoadedShared = nullptr;
16856+ if (FullFlatEmulation) {
16857+ CallInst *IsShared = Builder.CreateIntrinsic(
16858+ Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
16859+ Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16860+ Builder.SetInsertPoint(SharedBB);
16861+ Value *CastToLocal = Builder.CreateAddrSpaceCast(
16862+ Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
1680316863
16804- Instruction *Clone = AI->clone();
16805- Clone->insertInto(SharedBB, SharedBB->end());
16806- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16807- .set(CastToLocal);
16808- Instruction * LoadedShared = Clone;
16864+ Instruction *Clone = AI->clone();
16865+ Clone->insertInto(SharedBB, SharedBB->end());
16866+ Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16867+ .set(CastToLocal);
16868+ LoadedShared = Clone;
1680916869
16810- Builder.CreateBr(PhiBB);
16870+ Builder.CreateBr(PhiBB);
16871+ Builder.SetInsertPoint(CheckPrivateBB);
16872+ }
1681116873
16812- Builder.SetInsertPoint(CheckPrivateBB);
1681316874 CallInst *IsPrivate = Builder.CreateIntrinsic(
1681416875 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
1681516876 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16826,23 +16887,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1682616887 Builder.CreateBr(PhiBB);
1682716888
1682816889 Builder.SetInsertPoint(GlobalBB);
16829- Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16830- Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16831- Value *LoadedGlobal = AI;
1683216890
16833- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
16891+ // Continue using a flat instruction if we only emitted the check for private.
16892+ Instruction *LoadedGlobal = AI;
16893+ if (FullFlatEmulation) {
16894+ Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16895+ Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16896+ AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16897+ .set(CastToGlobal);
16898+ }
1683416899
1683516900 AI->removeFromParent();
1683616901 AI->insertInto(GlobalBB, GlobalBB->end());
1683716902
16903+ // The new atomicrmw may go through another round of legalization later.
16904+ if (!FullFlatEmulation) {
16905+ // We inserted the runtime check already, make sure we do not try to
16906+ // re-expand this.
16907+ // TODO: Should union with any existing metadata.
16908+ MDBuilder MDB(F->getContext());
16909+ MDNode *RangeNotPrivate =
16910+ MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
16911+ APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
16912+ LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
16913+ RangeNotPrivate);
16914+ }
16915+
1683816916 Builder.CreateBr(PhiBB);
1683916917
1684016918 Builder.SetInsertPoint(PhiBB);
1684116919
1684216920 if (ReturnValueIsUsed) {
1684316921 PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
1684416922 AI->replaceAllUsesWith(Loaded);
16845- Loaded->addIncoming(LoadedShared, SharedBB);
16923+ if (FullFlatEmulation)
16924+ Loaded->addIncoming(LoadedShared, SharedBB);
1684616925 Loaded->addIncoming(LoadedPrivate, PrivateBB);
1684716926 Loaded->addIncoming(LoadedGlobal, GlobalBB);
1684816927 Loaded->takeName(AI);
0 commit comments