3939#include "llvm/IR/IntrinsicInst.h"
4040#include "llvm/IR/IntrinsicsAMDGPU.h"
4141#include "llvm/IR/IntrinsicsR600.h"
42+ #include "llvm/IR/MDBuilder.h"
4243#include "llvm/Support/CommandLine.h"
4344#include "llvm/Support/KnownBits.h"
4445#include "llvm/Support/ModRef.h"
@@ -16340,12 +16341,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
1634016341 : TargetLowering::AtomicExpansionKind::CmpXChg;
1634116342}
1634216343
16344+ /// Return if a flat address space atomicrmw can access private memory.
16345+ static bool flatInstrMayAccessPrivate(const Instruction *I) {
16346+ const MDNode *NoaliasAddrSpaceMD =
16347+ I->getMetadata(LLVMContext::MD_noalias_addrspace);
16348+ if (!NoaliasAddrSpaceMD)
16349+ return true;
16350+
16351+ for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16352+ ++I) {
16353+ auto *Low = mdconst::extract<ConstantInt>(
16354+ NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16355+ auto *High = mdconst::extract<ConstantInt>(
16356+ NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16357+
16358+ if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
16359+ High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
16360+ return true;
16361+ }
16362+
16363+ return false;
16364+ }
16365+
1634316366TargetLowering::AtomicExpansionKind
1634416367SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1634516368 unsigned AS = RMW->getPointerAddressSpace();
1634616369 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1634716370 return AtomicExpansionKind::NotAtomic;
1634816371
16372+ // 64-bit flat atomics that dynamically reside in private memory will silently
16373+ // be dropped.
16374+ //
16375+ // Note that we will emit a new copy of the original atomic in the expansion,
16376+ // which will be incrementally relegalized.
16377+ const DataLayout &DL = RMW->getFunction()->getDataLayout();
16378+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
16379+ DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16380+ flatInstrMayAccessPrivate(RMW))
16381+ return AtomicExpansionKind::Expand;
16382+
1634916383 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
1635016384 OptimizationRemarkEmitter ORE(RMW->getFunction());
1635116385 ORE.emit([=]() {
@@ -16744,20 +16778,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1674416778
1674516779 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
1674616780 Op == AtomicRMWInst::Xor) {
16747- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16748- assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16749- "this cannot be replaced with add");
16750- AI->setOperation(AtomicRMWInst::Add);
16751- return;
16781+ if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16782+ ConstVal && ConstVal->isNullValue()) {
16783+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16784+ AI->setOperation(AtomicRMWInst::Add);
16785+
16786+ // TODO: Turn the below private handling into a no-op for idempotent
16787+ // cases.
16788+ }
1675216789 }
1675316790
16754- assert(Subtarget->hasAtomicFaddInsts() &&
16755- "target should have atomic fadd instructions");
16756- assert(AI->getType()->isFloatTy() &&
16757- AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16758- "generic atomicrmw expansion only supports FP32 operand in flat "
16759- "address space");
16760- assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16791+ // The non-flat expansions should only perform the de-canonicalization of
16792+ // identity values.
16793+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16794+ return;
16795+
16796+ // FullFlatEmulation is true if we need to issue the private, shared, and
16797+ // global cases.
16798+ //
16799+ // If this is false, we are only dealing with the flat-targeting-private case,
16800+ // where we only insert a check for private and still use the flat instruction
16801+ // for global and shared.
16802+
16803+ // TODO: Avoid the private check for the fadd case depending on
16804+ // noalias.addrspace.
16805+
16806+ bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16807+ Subtarget->hasAtomicFaddInsts() &&
16808+ AI->getType()->isFloatTy();
1676116809
1676216810 // Given: atomicrmw fadd ptr %addr, float %val ordering
1676316811 //
@@ -16797,6 +16845,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1679716845 //
1679816846 // atomicrmw.end:
1679916847 // [...]
16848+ //
16849+ //
16850+ // For 64-bit atomics which may reside in private memory, we perform a simpler
16851+ // version that only inserts the private check, and uses the flat operation.
1680016852
1680116853 IRBuilder<> Builder(AI);
1680216854 LLVMContext &Ctx = Builder.getContext();
@@ -16808,9 +16860,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1680816860 Function *F = BB->getParent();
1680916861 BasicBlock *ExitBB =
1681016862 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16811- BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16812- BasicBlock *CheckPrivateBB =
16813- BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16863+ BasicBlock *SharedBB = nullptr;
16864+
16865+ BasicBlock *CheckPrivateBB = BB;
16866+ if (FullFlatEmulation) {
16867+ SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16868+ CheckPrivateBB =
16869+ BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16870+ }
16871+
1681416872 BasicBlock *PrivateBB =
1681516873 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
1681616874 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16823,23 +16881,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1682316881
1682416882 std::prev(BB->end())->eraseFromParent();
1682516883 Builder.SetInsertPoint(BB);
16826- CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16827- {Addr}, nullptr, "is.shared");
16828- Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
1682916884
16830- Builder.SetInsertPoint(SharedBB);
16831- Value *CastToLocal = Builder.CreateAddrSpaceCast(
16832- Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16885+ Value *LoadedShared = nullptr;
16886+ if (FullFlatEmulation) {
16887+ CallInst *IsShared = Builder.CreateIntrinsic(
16888+ Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
16889+ Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16890+ Builder.SetInsertPoint(SharedBB);
16891+ Value *CastToLocal = Builder.CreateAddrSpaceCast(
16892+ Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
1683316893
16834- Instruction *Clone = AI->clone();
16835- Clone->insertInto(SharedBB, SharedBB->end());
16836- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16837- .set(CastToLocal);
16838- Instruction * LoadedShared = Clone;
16894+ Instruction *Clone = AI->clone();
16895+ Clone->insertInto(SharedBB, SharedBB->end());
16896+ Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16897+ .set(CastToLocal);
16898+ LoadedShared = Clone;
1683916899
16840- Builder.CreateBr(PhiBB);
16900+ Builder.CreateBr(PhiBB);
16901+ Builder.SetInsertPoint(CheckPrivateBB);
16902+ }
1684116903
16842- Builder.SetInsertPoint(CheckPrivateBB);
1684316904 CallInst *IsPrivate = Builder.CreateIntrinsic(
1684416905 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
1684516906 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16856,23 +16917,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1685616917 Builder.CreateBr(PhiBB);
1685716918
1685816919 Builder.SetInsertPoint(GlobalBB);
16859- Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16860- Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16861- Value *LoadedGlobal = AI;
1686216920
16863- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
16921+ // Continue using a flat instruction if we only emitted the check for private.
16922+ Instruction *LoadedGlobal = AI;
16923+ if (FullFlatEmulation) {
16924+ Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16925+ Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16926+ AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16927+ .set(CastToGlobal);
16928+ }
1686416929
1686516930 AI->removeFromParent();
1686616931 AI->insertInto(GlobalBB, GlobalBB->end());
1686716932
16933+ // The new atomicrmw may go through another round of legalization later.
16934+ if (!FullFlatEmulation) {
16935+ // We inserted the runtime check already, make sure we do not try to
16936+ // re-expand this.
16937+ // TODO: Should union with any existing metadata.
16938+ MDBuilder MDB(F->getContext());
16939+ MDNode *RangeNotPrivate =
16940+ MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
16941+ APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
16942+ LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
16943+ RangeNotPrivate);
16944+ }
16945+
1686816946 Builder.CreateBr(PhiBB);
1686916947
1687016948 Builder.SetInsertPoint(PhiBB);
1687116949
1687216950 if (ReturnValueIsUsed) {
1687316951 PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
1687416952 AI->replaceAllUsesWith(Loaded);
16875- Loaded->addIncoming(LoadedShared, SharedBB);
16953+ if (FullFlatEmulation)
16954+ Loaded->addIncoming(LoadedShared, SharedBB);
1687616955 Loaded->addIncoming(LoadedPrivate, PrivateBB);
1687716956 Loaded->addIncoming(LoadedGlobal, GlobalBB);
1687816957 Loaded->takeName(AI);
0 commit comments