@@ -17300,26 +17300,80 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1730017300
1730117301 auto Op = RMW->getOperation();
1730217302 switch (Op) {
17303- case AtomicRMWInst::Xchg: {
17303+ case AtomicRMWInst::Xchg:
1730417304 // PCIe supports add and xchg for system atomics.
1730517305 return isAtomicRMWLegalXChgTy(RMW)
1730617306 ? TargetLowering::AtomicExpansionKind::None
1730717307 : TargetLowering::AtomicExpansionKind::CmpXChg;
17308- }
1730917308 case AtomicRMWInst::Add:
17310- case AtomicRMWInst::And:
17311- case AtomicRMWInst::UIncWrap:
17312- case AtomicRMWInst::UDecWrap:
17309+ // PCIe supports add and xchg for system atomics.
1731317310 return atomicSupportedIfLegalIntType(RMW);
1731417311 case AtomicRMWInst::Sub:
17312+ case AtomicRMWInst::And:
1731517313 case AtomicRMWInst::Or:
17316- case AtomicRMWInst::Xor: {
17317- // Atomic sub/or/xor do not work over PCI express, but atomic add
17318- // does. InstCombine transforms these with 0 to or, so undo that.
17319- if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
17320- if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
17321- ConstVal && ConstVal->isNullValue())
17322- return AtomicExpansionKind::Expand;
17314+ case AtomicRMWInst::Xor:
17315+ case AtomicRMWInst::Max:
17316+ case AtomicRMWInst::Min:
17317+ case AtomicRMWInst::UMax:
17318+ case AtomicRMWInst::UMin:
17319+ case AtomicRMWInst::UIncWrap:
17320+ case AtomicRMWInst::UDecWrap: {
17321+ if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
17322+ AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17323+ // On most subtargets, for atomicrmw operations other than add/xchg,
17324+ // whether or not the instructions will behave correctly depends on where
17325+ // the address physically resides and what interconnect is used in the
17326+    // system configuration. On some targets the instruction will nop,
17327+ // and in others synchronization will only occur at degraded device scope.
17328+ //
17329+ // If the allocation is known local to the device, the instructions should
17330+ // work correctly.
17331+ if (RMW->hasMetadata("amdgpu.no.remote.memory"))
17332+ return atomicSupportedIfLegalIntType(RMW);
17333+
17334+ // If fine-grained remote memory works at device scope, we don't need to
17335+ // do anything.
17336+ if (!HasSystemScope &&
17337+ Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
17338+ return atomicSupportedIfLegalIntType(RMW);
17339+
17340+ // If we are targeting a remote allocated address, it depends what kind of
17341+ // allocation the address belongs to.
17342+ //
17343+ // If the allocation is fine-grained (in host memory, or in PCIe peer
17344+ // device memory), the operation will fail depending on the target.
17345+ //
17346+ // Note fine-grained host memory access does work on APUs or if XGMI is
17347+ // used, but we do not know if we are targeting an APU or the system
17348+ // configuration from the ISA version/target-cpu.
17349+ if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
17350+ return atomicSupportedIfLegalIntType(RMW);
17351+
17352+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17353+ Op == AtomicRMWInst::Xor) {
17354+ // Atomic sub/or/xor do not work over PCI express, but atomic add
17355+ // does. InstCombine transforms these with 0 to or, so undo that.
17356+ if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
17357+ ConstVal && ConstVal->isNullValue())
17358+ return AtomicExpansionKind::Expand;
17359+ }
17360+
17361+ // If the allocation could be in remote, fine-grained memory, the rmw
17362+ // instructions may fail. cmpxchg should work, so emit that. On some
17363+ // system configurations, PCIe atomics aren't supported so cmpxchg won't
17364+ // even work, so you're out of luck anyway.
17365+
17366+ // In summary:
17367+ //
17368+ // Cases that may fail:
17369+ // - fine-grained pinned host memory
17370+ // - fine-grained migratable host memory
17371+ // - fine-grained PCIe peer device
17372+ //
17373+ // Cases that should work, but may be treated overly conservatively.
17374+ // - fine-grained host memory on an APU
17375+ // - fine-grained XGMI peer device
17376+ return AtomicExpansionKind::CmpXChg;
1732317377 }
1732417378
1732517379 return atomicSupportedIfLegalIntType(RMW);
@@ -17474,19 +17528,6 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1747417528
1747517529 return AtomicExpansionKind::CmpXChg;
1747617530 }
17477- case AtomicRMWInst::Min:
17478- case AtomicRMWInst::Max:
17479- case AtomicRMWInst::UMin:
17480- case AtomicRMWInst::UMax: {
17481- if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
17482- AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17483- // Always expand system scope min/max atomics.
17484- if (HasSystemScope)
17485- return AtomicExpansionKind::CmpXChg;
17486- }
17487-
17488- return atomicSupportedIfLegalIntType(RMW);
17489- }
1749017531 case AtomicRMWInst::Nand:
1749117532 case AtomicRMWInst::FSub:
1749217533 default:
0 commit comments