@@ -17860,19 +17860,63 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1786017860 case AtomicRMWInst::UDecWrap: {
1786117861 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
1786217862 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17863- // Always expand system scope atomics.
17864- if (HasSystemScope && !Subtarget->hasEmulatedSystemScopeAtomics()) {
17865- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17866- Op == AtomicRMWInst::Xor) {
17867- // Atomic sub/or/xor do not work over PCI express, but atomic add
17868- // does. InstCombine transforms these with 0 to or, so undo that.
17869- if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
17870- ConstVal && ConstVal->isNullValue())
17871- return AtomicExpansionKind::Expand;
17872- }
17873-
17874- return AtomicExpansionKind::CmpXChg;
17863+ if (Subtarget->hasEmulatedSystemScopeAtomics())
17864+ return atomicSupportedIfLegalIntType(RMW);
17865+
17866+ // On most subtargets, for atomicrmw operations other than add/xchg,
17867+ // whether or not the instructions will behave correctly depends on where
17868+ // the address physically resides and what interconnect is used in the
17869+    // system configuration. On some targets the instruction will nop,
17870+ // and in others synchronization will only occur at degraded device scope.
17871+ //
17872+ // If the allocation is known local to the device, the instructions should
17873+ // work correctly.
17874+ if (RMW->hasMetadata("amdgpu.no.remote.memory"))
17875+ return atomicSupportedIfLegalIntType(RMW);
17876+
17877+ // If fine-grained remote memory works at device scope, we don't need to
17878+ // do anything.
17879+ if (!HasSystemScope &&
17880+ Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
17881+ return atomicSupportedIfLegalIntType(RMW);
17882+
17883+ // If we are targeting a remote allocated address, it depends what kind of
17884+ // allocation the address belongs to.
17885+ //
17886+ // If the allocation is fine-grained (in host memory, or in PCIe peer
17887+ // device memory), the operation will fail depending on the target.
17888+ //
17889+ // Note fine-grained host memory access does work on APUs or if XGMI is
17890+ // used, but we do not know if we are targeting an APU or the system
17891+ // configuration from the ISA version/target-cpu.
17892+ if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
17893+ return atomicSupportedIfLegalIntType(RMW);
17894+
17895+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17896+ Op == AtomicRMWInst::Xor) {
17897+ // Atomic sub/or/xor do not work over PCI express, but atomic add
17898+ // does. InstCombine transforms these with 0 to or, so undo that.
17899+ if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
17900+ ConstVal && ConstVal->isNullValue())
17901+ return AtomicExpansionKind::Expand;
1787517902 }
17903+
17904+ // If the allocation could be in remote, fine-grained memory, the rmw
17905+ // instructions may fail. cmpxchg should work, so emit that. On some
17906+ // system configurations, PCIe atomics aren't supported so cmpxchg won't
17907+ // even work, so you're out of luck anyway.
17908+
17909+ // In summary:
17910+ //
17911+ // Cases that may fail:
17912+ // - fine-grained pinned host memory
17913+ // - fine-grained migratable host memory
17914+ // - fine-grained PCIe peer device
17915+ //
17916+      // Cases that should work, but may be treated overly conservatively:
17917+ // - fine-grained host memory on an APU
17918+ // - fine-grained XGMI peer device
17919+ return AtomicExpansionKind::CmpXChg;
1787617920 }
1787717921
1787817922 return atomicSupportedIfLegalIntType(RMW);
0 commit comments