@@ -16854,26 +16854,80 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   auto Op = RMW->getOperation();
   switch (Op) {
-  case AtomicRMWInst::Xchg: {
+  case AtomicRMWInst::Xchg:
     // PCIe supports add and xchg for system atomics.
     return isAtomicRMWLegalXChgTy(RMW)
                ? TargetLowering::AtomicExpansionKind::None
                : TargetLowering::AtomicExpansionKind::CmpXChg;
-  }
   case AtomicRMWInst::Add:
-  case AtomicRMWInst::And:
-  case AtomicRMWInst::UIncWrap:
-  case AtomicRMWInst::UDecWrap:
+    // PCIe supports add and xchg for system atomics.
     return atomicSupportedIfLegalIntType(RMW);
   case AtomicRMWInst::Sub:
+  case AtomicRMWInst::And:
   case AtomicRMWInst::Or:
-  case AtomicRMWInst::Xor: {
-    // Atomic sub/or/xor do not work over PCI express, but atomic add
-    // does. InstCombine transforms these with 0 to or, so undo that.
-    if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
-      if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
-          ConstVal && ConstVal->isNullValue())
-        return AtomicExpansionKind::Expand;
+  case AtomicRMWInst::Xor:
+  case AtomicRMWInst::Max:
+  case AtomicRMWInst::Min:
+  case AtomicRMWInst::UMax:
+  case AtomicRMWInst::UMin:
+  case AtomicRMWInst::UIncWrap:
+  case AtomicRMWInst::UDecWrap: {
+    if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
+        AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+      // On most subtargets, for atomicrmw operations other than add/xchg,
+      // whether or not the instructions will behave correctly depends on where
+      // the address physically resides and what interconnect is used in the
+      // system configuration. On some targets the instruction will nop,
+      // and in others synchronization will only occur at degraded device scope.
+      //
+      // If the allocation is known local to the device, the instructions should
+      // work correctly.
+      if (RMW->hasMetadata("amdgpu.no.remote.memory"))
+        return atomicSupportedIfLegalIntType(RMW);
+
+      // If fine-grained remote memory works at device scope, we don't need to
+      // do anything.
+      if (!HasSystemScope &&
+          Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+        return atomicSupportedIfLegalIntType(RMW);
+
+      // If we are targeting a remote allocated address, it depends what kind of
+      // allocation the address belongs to.
+      //
+      // If the allocation is fine-grained (in host memory, or in PCIe peer
+      // device memory), the operation will fail depending on the target.
+      //
+      // Note fine-grained host memory access does work on APUs or if XGMI is
+      // used, but we do not know if we are targeting an APU or the system
+      // configuration from the ISA version/target-cpu.
+      if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
+        return atomicSupportedIfLegalIntType(RMW);
+
+      if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+          Op == AtomicRMWInst::Xor) {
+        // Atomic sub/or/xor do not work over PCI express, but atomic add
+        // does. InstCombine transforms these with 0 to or, so undo that.
+        if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
+            ConstVal && ConstVal->isNullValue())
+          return AtomicExpansionKind::Expand;
+      }
+
+      // If the allocation could be in remote, fine-grained memory, the rmw
+      // instructions may fail. cmpxchg should work, so emit that. On some
+      // system configurations, PCIe atomics aren't supported so cmpxchg won't
+      // even work, so you're out of luck anyway.
+
+      // In summary:
+      //
+      // Cases that may fail:
+      // - fine-grained pinned host memory
+      // - fine-grained migratable host memory
+      // - fine-grained PCIe peer device
+      //
+      // Cases that should work, but may be treated overly conservatively:
+      // - fine-grained host memory on an APU
+      // - fine-grained XGMI peer device
+      return AtomicExpansionKind::CmpXChg;
     }
 
     return atomicSupportedIfLegalIntType(RMW);
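For context: the amdgpu.no.remote.memory and amdgpu.no.fine.grained.memory strings queried through RMW->hasMetadata() above are ordinary named instruction metadata, and the hook only checks for their presence. A minimal sketch of how a frontend or pass might attach them using the standard LLVM C++ metadata API (markDeviceLocalAtomic is a hypothetical helper name, not part of this patch):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Hypothetical helper: assert that this atomicrmw never targets remote or
// fine-grained allocations, so shouldExpandAtomicRMWInIR can keep the native
// hardware atomic instead of falling back to a cmpxchg expansion.
static void markDeviceLocalAtomic(AtomicRMWInst *RMW) {
  MDNode *Empty = MDNode::get(RMW->getContext(), {});
  RMW->setMetadata("amdgpu.no.remote.memory", Empty);
  RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);
}

Since only the metadata kind is inspected, an empty MDNode payload is enough to flip the decision above from the conservative CmpXChg expansion to atomicSupportedIfLegalIntType(RMW).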
@@ -17028,19 +17082,6 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 
     return AtomicExpansionKind::CmpXChg;
   }
-  case AtomicRMWInst::Min:
-  case AtomicRMWInst::Max:
-  case AtomicRMWInst::UMin:
-  case AtomicRMWInst::UMax: {
-    if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
-        AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-      // Always expand system scope min/max atomics.
-      if (HasSystemScope)
-        return AtomicExpansionKind::CmpXChg;
-    }
-
-    return atomicSupportedIfLegalIntType(RMW);
-  }
   case AtomicRMWInst::Nand:
   case AtomicRMWInst::FSub:
   default:
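Where this hook returns AtomicExpansionKind::CmpXChg, AtomicExpandPass later rewrites the atomicrmw into a compare-exchange retry loop, so correctness only depends on cmpxchg working over the interconnect. A rough host-side C++ analogue of that shape, for illustration only (the real expansion is performed on IR by the pass, not with std::atomic):

#include <atomic>

// Illustrative sketch of the compare-exchange retry loop that a cmpxchg-based
// expansion of 'atomicrmw or' boils down to.
int atomicOrViaCmpXchg(std::atomic<int> &Addr, int Val) {
  int Old = Addr.load(std::memory_order_relaxed);
  // On failure, compare_exchange_weak reloads Old with the value it found,
  // and the loop retries with the recomputed operand.
  while (!Addr.compare_exchange_weak(Old, Old | Val))
    ;
  return Old; // atomicrmw yields the value that was in memory before the op
}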