3939#include "llvm/IR/IntrinsicInst.h"
4040#include "llvm/IR/IntrinsicsAMDGPU.h"
4141#include "llvm/IR/IntrinsicsR600.h"
42+ #include "llvm/IR/MDBuilder.h"
4243#include "llvm/Support/CommandLine.h"
4344#include "llvm/Support/KnownBits.h"
4445#include "llvm/Support/ModRef.h"
@@ -16310,12 +16311,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
1631016311 : TargetLowering::AtomicExpansionKind::CmpXChg;
1631116312}
1631216313
16314+ /// Return if a flat address space atomicrmw can access private memory.
16315+ static bool flatInstrMayAccessPrivate(const Instruction *I) {
16316+ const MDNode *NoaliasAddrSpaceMD =
16317+ I->getMetadata(LLVMContext::MD_noalias_addrspace);
16318+ if (!NoaliasAddrSpaceMD)
16319+ return true;
16320+
16321+ for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16322+ ++I) {
16323+ auto *Low = mdconst::extract<ConstantInt>(
16324+ NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16325+ auto *High = mdconst::extract<ConstantInt>(
16326+ NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16327+
16328+ if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
16329+ High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
16330+ return true;
16331+ }
16332+
16333+ return false;
16334+ }
16335+
1631316336TargetLowering::AtomicExpansionKind
1631416337SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1631516338 unsigned AS = RMW->getPointerAddressSpace();
1631616339 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1631716340 return AtomicExpansionKind::NotAtomic;
1631816341
16342+ // 64-bit flat atomics that dynamically reside in private memory will silently
16343+ // be dropped.
16344+ //
16345+ // Note that we will emit a new copy of the original atomic in the expansion,
16346+ // which will be incrementally relegalized.
16347+ const DataLayout &DL = RMW->getFunction()->getDataLayout();
16348+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
16349+ DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16350+ flatInstrMayAccessPrivate(RMW))
16351+ return AtomicExpansionKind::Expand;
16352+
1631916353 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
1632016354 OptimizationRemarkEmitter ORE(RMW->getFunction());
1632116355 ORE.emit([=]() {
@@ -16716,20 +16750,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1671616750
1671716751 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
1671816752 Op == AtomicRMWInst::Xor) {
16719- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16720- assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16721- "this cannot be replaced with add");
16722- AI->setOperation(AtomicRMWInst::Add);
16723- return;
16753+ if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16754+ ConstVal && ConstVal->isNullValue()) {
16755+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16756+ AI->setOperation(AtomicRMWInst::Add);
16757+
16758+ // TODO: Turn the below private handling into a no-op for idempotent
16759+ // cases.
16760+ }
1672416761 }
1672516762
16726- assert(Subtarget->hasAtomicFaddInsts() &&
16727- "target should have atomic fadd instructions");
16728- assert(AI->getType()->isFloatTy() &&
16729- AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16730- "generic atomicrmw expansion only supports FP32 operand in flat "
16731- "address space");
16732- assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16763+ // The non-flat expansions should only perform the de-canonicalization of
16764+ // identity values.
16765+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16766+ return;
16767+
16768+ // FullFlatEmulation is true if we need to issue the private, shared, and
16769+ // global cases.
16770+ //
16771+ // If this is false, we are only dealing with the flat-targeting-private case,
16772+ // where we only insert a check for private and still use the flat instruction
16773+ // for global and shared.
16774+
16775+ // TODO: Avoid the private check for the fadd case depending on
16776+ // noalias.addrspace.
16777+
16778+ bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16779+ Subtarget->hasAtomicFaddInsts() &&
16780+ AI->getType()->isFloatTy();
1673316781
1673416782 // Given: atomicrmw fadd ptr %addr, float %val ordering
1673516783 //
@@ -16769,6 +16817,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1676916817 //
1677016818 // atomicrmw.end:
1677116819 // [...]
16820+ //
16821+ //
16822+ // For 64-bit atomics which may reside in private memory, we perform a simpler
16823+ // version that only inserts the private check, and uses the flat operation.
1677216824
1677316825 IRBuilder<> Builder(AI);
1677416826 LLVMContext &Ctx = Builder.getContext();
@@ -16780,9 +16832,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1678016832 Function *F = BB->getParent();
1678116833 BasicBlock *ExitBB =
1678216834 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16783- BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16784- BasicBlock *CheckPrivateBB =
16785- BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16835+ BasicBlock *SharedBB = nullptr;
16836+
16837+ BasicBlock *CheckPrivateBB = BB;
16838+ if (FullFlatEmulation) {
16839+ SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16840+ CheckPrivateBB =
16841+ BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16842+ }
16843+
1678616844 BasicBlock *PrivateBB =
1678716845 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
1678816846 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16795,23 +16853,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1679516853
1679616854 std::prev(BB->end())->eraseFromParent();
1679716855 Builder.SetInsertPoint(BB);
16798- CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16799- {Addr}, nullptr, "is.shared");
16800- Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
1680116856
16802- Builder.SetInsertPoint(SharedBB);
16803- Value *CastToLocal = Builder.CreateAddrSpaceCast(
16804- Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16857+ Value *LoadedShared = nullptr;
16858+ if (FullFlatEmulation) {
16859+ CallInst *IsShared = Builder.CreateIntrinsic(
16860+ Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
16861+ Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16862+ Builder.SetInsertPoint(SharedBB);
16863+ Value *CastToLocal = Builder.CreateAddrSpaceCast(
16864+ Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
1680516865
16806- Instruction *Clone = AI->clone();
16807- Clone->insertInto(SharedBB, SharedBB->end());
16808- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16809- .set(CastToLocal);
16810- Instruction * LoadedShared = Clone;
16866+ Instruction *Clone = AI->clone();
16867+ Clone->insertInto(SharedBB, SharedBB->end());
16868+ Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16869+ .set(CastToLocal);
16870+ LoadedShared = Clone;
1681116871
16812- Builder.CreateBr(PhiBB);
16872+ Builder.CreateBr(PhiBB);
16873+ Builder.SetInsertPoint(CheckPrivateBB);
16874+ }
1681316875
16814- Builder.SetInsertPoint(CheckPrivateBB);
1681516876 CallInst *IsPrivate = Builder.CreateIntrinsic(
1681616877 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
1681716878 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16828,23 +16889,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1682816889 Builder.CreateBr(PhiBB);
1682916890
1683016891 Builder.SetInsertPoint(GlobalBB);
16831- Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16832- Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16833- Value *LoadedGlobal = AI;
1683416892
16835- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
16893+ // Continue using a flat instruction if we only emitted the check for private.
16894+ Instruction *LoadedGlobal = AI;
16895+ if (FullFlatEmulation) {
16896+ Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16897+ Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16898+ AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16899+ .set(CastToGlobal);
16900+ }
1683616901
1683716902 AI->removeFromParent();
1683816903 AI->insertInto(GlobalBB, GlobalBB->end());
1683916904
16905+ // The new atomicrmw may go through another round of legalization later.
16906+ if (!FullFlatEmulation) {
16907+ // We inserted the runtime check already, make sure we do not try to
16908+ // re-expand this.
16909+ // TODO: Should union with any existing metadata.
16910+ MDBuilder MDB(F->getContext());
16911+ MDNode *RangeNotPrivate =
16912+ MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
16913+ APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
16914+ LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
16915+ RangeNotPrivate);
16916+ }
16917+
1684016918 Builder.CreateBr(PhiBB);
1684116919
1684216920 Builder.SetInsertPoint(PhiBB);
1684316921
1684416922 if (ReturnValueIsUsed) {
1684516923 PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
1684616924 AI->replaceAllUsesWith(Loaded);
16847- Loaded->addIncoming(LoadedShared, SharedBB);
16925+ if (FullFlatEmulation)
16926+ Loaded->addIncoming(LoadedShared, SharedBB);
1684816927 Loaded->addIncoming(LoadedPrivate, PrivateBB);
1684916928 Loaded->addIncoming(LoadedGlobal, GlobalBB);
1685016929 Loaded->takeName(AI);
0 commit comments