3939#include "llvm/IR/IntrinsicInst.h"
4040#include "llvm/IR/IntrinsicsAMDGPU.h"
4141#include "llvm/IR/IntrinsicsR600.h"
42+ #include "llvm/IR/MDBuilder.h"
4243#include "llvm/Support/CommandLine.h"
4344#include "llvm/Support/KnownBits.h"
4445#include "llvm/Support/ModRef.h"
@@ -16328,12 +16329,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
1632816329 : TargetLowering::AtomicExpansionKind::CmpXChg;
1632916330}
1633016331
16332+ /// Return if a flat address space atomicrmw can access private memory.
16333+ static bool flatInstrMayAccessPrivate(const Instruction *I) {
16334+ const MDNode *NoaliasAddrSpaceMD =
16335+ I->getMetadata(LLVMContext::MD_noalias_addrspace);
16336+ if (!NoaliasAddrSpaceMD)
16337+ return true;
16338+
16339+ for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16340+ ++I) {
16341+ auto *Low = mdconst::extract<ConstantInt>(
16342+ NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16343+ auto *High = mdconst::extract<ConstantInt>(
16344+ NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16345+
16346+ if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
16347+ High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
16348+ return true;
16349+ }
16350+
16351+ return false;
16352+ }
16353+
1633116354TargetLowering::AtomicExpansionKind
1633216355SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1633316356 unsigned AS = RMW->getPointerAddressSpace();
1633416357 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1633516358 return AtomicExpansionKind::NotAtomic;
1633616359
16360+ // 64-bit flat atomics that dynamically reside in private memory will silently
16361+ // be dropped.
16362+ //
16363+ // Note that we will emit a new copy of the original atomic in the expansion,
16364+ // which will be incrementally relegalized.
16365+ const DataLayout &DL = RMW->getFunction()->getDataLayout();
16366+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
16367+ DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16368+ flatInstrMayAccessPrivate(RMW))
16369+ return AtomicExpansionKind::Expand;
16370+
1633716371 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
1633816372 OptimizationRemarkEmitter ORE(RMW->getFunction());
1633916373 ORE.emit([=]() {
@@ -16732,20 +16766,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1673216766
1673316767 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
1673416768 Op == AtomicRMWInst::Xor) {
16735- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16736- assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16737- "this cannot be replaced with add");
16738- AI->setOperation(AtomicRMWInst::Add);
16739- return;
16769+ if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16770+ ConstVal && ConstVal->isNullValue()) {
16771+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16772+ AI->setOperation(AtomicRMWInst::Add);
16773+
16774+ // TODO: Turn the below private handling into a no-op for idempotent
16775+ // cases.
16776+ }
1674016777 }
1674116778
16742- assert(Subtarget->hasAtomicFaddInsts() &&
16743- "target should have atomic fadd instructions");
16744- assert(AI->getType()->isFloatTy() &&
16745- AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16746- "generic atomicrmw expansion only supports FP32 operand in flat "
16747- "address space");
16748- assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16779+ // The non-flat expansions should only perform the de-canonicalization of
16780+ // identity values.
16781+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16782+ return;
16783+
16784+ // FullFlatEmulation is true if we need to issue the private, shared, and
16785+ // global cases.
16786+ //
16787+ // If this is false, we are only dealing with the flat-targeting-private case,
16788+ // where we only insert a check for private and still use the flat instruction
16789+ // for global and shared.
16790+
16791+ // TODO: Avoid the private check for the fadd case depending on
16792+ // noalias.addrspace.
16793+
16794+ bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16795+ Subtarget->hasAtomicFaddInsts() &&
16796+ AI->getType()->isFloatTy();
1674916797
1675016798 // Given: atomicrmw fadd ptr %addr, float %val ordering
1675116799 //
@@ -16785,6 +16833,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1678516833 //
1678616834 // atomicrmw.end:
1678716835 // [...]
16836+ //
16837+ //
16838+ // For 64-bit atomics which may reside in private memory, we perform a simpler
16839+ // version that only inserts the private check, and uses the flat operation.
1678816840
1678916841 IRBuilder<> Builder(AI);
1679016842 LLVMContext &Ctx = Builder.getContext();
@@ -16796,9 +16848,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1679616848 Function *F = BB->getParent();
1679716849 BasicBlock *ExitBB =
1679816850 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16799- BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16800- BasicBlock *CheckPrivateBB =
16801- BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16851+ BasicBlock *SharedBB = nullptr;
16852+
16853+ BasicBlock *CheckPrivateBB = BB;
16854+ if (FullFlatEmulation) {
16855+ SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16856+ CheckPrivateBB =
16857+ BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16858+ }
16859+
1680216860 BasicBlock *PrivateBB =
1680316861 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
1680416862 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16811,23 +16869,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1681116869
1681216870 std::prev(BB->end())->eraseFromParent();
1681316871 Builder.SetInsertPoint(BB);
16814- CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16815- {Addr}, nullptr, "is.shared");
16816- Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
1681716872
16818- Builder.SetInsertPoint(SharedBB);
16819- Value *CastToLocal = Builder.CreateAddrSpaceCast(
16820- Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16873+ Value *LoadedShared = nullptr;
16874+ if (FullFlatEmulation) {
16875+ CallInst *IsShared = Builder.CreateIntrinsic(
16876+ Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
16877+ Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16878+ Builder.SetInsertPoint(SharedBB);
16879+ Value *CastToLocal = Builder.CreateAddrSpaceCast(
16880+ Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
1682116881
16822- Instruction *Clone = AI->clone();
16823- Clone->insertInto(SharedBB, SharedBB->end());
16824- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16825- .set(CastToLocal);
16826- Instruction * LoadedShared = Clone;
16882+ Instruction *Clone = AI->clone();
16883+ Clone->insertInto(SharedBB, SharedBB->end());
16884+ Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16885+ .set(CastToLocal);
16886+ LoadedShared = Clone;
1682716887
16828- Builder.CreateBr(PhiBB);
16888+ Builder.CreateBr(PhiBB);
16889+ Builder.SetInsertPoint(CheckPrivateBB);
16890+ }
1682916891
16830- Builder.SetInsertPoint(CheckPrivateBB);
1683116892 CallInst *IsPrivate = Builder.CreateIntrinsic(
1683216893 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
1683316894 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16844,23 +16905,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1684416905 Builder.CreateBr(PhiBB);
1684516906
1684616907 Builder.SetInsertPoint(GlobalBB);
16847- Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16848- Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16849- Value *LoadedGlobal = AI;
1685016908
16851- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
16909+ // Continue using a flat instruction if we only emitted the check for private.
16910+ Instruction *LoadedGlobal = AI;
16911+ if (FullFlatEmulation) {
16912+ Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16913+ Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16914+ AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16915+ .set(CastToGlobal);
16916+ }
1685216917
1685316918 AI->removeFromParent();
1685416919 AI->insertInto(GlobalBB, GlobalBB->end());
1685516920
16921+ // The new atomicrmw may go through another round of legalization later.
16922+ if (!FullFlatEmulation) {
16923+ // We inserted the runtime check already, make sure we do not try to
16924+ // re-expand this.
16925+ // TODO: Should union with any existing metadata.
16926+ MDBuilder MDB(F->getContext());
16927+ MDNode *RangeNotPrivate =
16928+ MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
16929+ APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
16930+ LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
16931+ RangeNotPrivate);
16932+ }
16933+
1685616934 Builder.CreateBr(PhiBB);
1685716935
1685816936 Builder.SetInsertPoint(PhiBB);
1685916937
1686016938 if (ReturnValueIsUsed) {
1686116939 PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
1686216940 AI->replaceAllUsesWith(Loaded);
16863- Loaded->addIncoming(LoadedShared, SharedBB);
16941+ if (FullFlatEmulation)
16942+ Loaded->addIncoming(LoadedShared, SharedBB);
1686416943 Loaded->addIncoming(LoadedPrivate, PrivateBB);
1686516944 Loaded->addIncoming(LoadedGlobal, GlobalBB);
1686616945 Loaded->takeName(AI);
0 commit comments