@@ -17864,19 +17864,63 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1786417864 case AtomicRMWInst::UDecWrap: {
1786517865 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
1786617866 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17867- // Always expand system scope atomics.
17868- if (HasSystemScope && !Subtarget->hasEmulatedSystemScopeAtomics()) {
17869- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17870- Op == AtomicRMWInst::Xor) {
17871- // Atomic sub/or/xor do not work over PCI express, but atomic add
17872- // does. InstCombine transforms these with 0 to or, so undo that.
17873- if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
17874- ConstVal && ConstVal->isNullValue())
17875- return AtomicExpansionKind::Expand;
17876- }
17877-
17878- return AtomicExpansionKind::CmpXChg;
17867+ if (Subtarget->hasEmulatedSystemScopeAtomics())
17868+ return atomicSupportedIfLegalIntType(RMW);
17869+
17870+ // On most subtargets, for atomicrmw operations other than add/xchg,
17871+ // whether or not the instructions will behave correctly depends on where
17872+ // the address physically resides and what interconnect is used in the
17873+ // system configuration. On some targets the instruction will nop,
17874+ // and in others synchronization will only occur at degraded device scope.
17875+ //
17876+ // If the allocation is known local to the device, the instructions should
17877+ // work correctly.
17878+ if (RMW->hasMetadata("amdgpu.no.remote.memory"))
17879+ return atomicSupportedIfLegalIntType(RMW);
17880+
17881+ // If fine-grained remote memory works at device scope, we don't need to
17882+ // do anything.
17883+ if (!HasSystemScope &&
17884+ Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
17885+ return atomicSupportedIfLegalIntType(RMW);
17886+
17887+ // If we are targeting a remote allocated address, it depends what kind of
17888+ // allocation the address belongs to.
17889+ //
17890+ // If the allocation is fine-grained (in host memory, or in PCIe peer
17891+ // device memory), the operation will fail depending on the target.
17892+ //
17893+ // Note fine-grained host memory access does work on APUs or if XGMI is
17894+ // used, but we do not know if we are targeting an APU or the system
17895+ // configuration from the ISA version/target-cpu.
17896+ if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
17897+ return atomicSupportedIfLegalIntType(RMW);
17898+
17899+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17900+ Op == AtomicRMWInst::Xor) {
17901+ // Atomic sub/or/xor do not work over PCI express, but atomic add
17902+ // does. InstCombine transforms these with 0 to or, so undo that.
17903+ if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
17904+ ConstVal && ConstVal->isNullValue())
17905+ return AtomicExpansionKind::Expand;
1787917906 }
17907+
17908+ // If the allocation could be in remote, fine-grained memory, the rmw
17909+ // instructions may fail. cmpxchg should work, so emit that. On some
17910+ // system configurations, PCIe atomics aren't supported so cmpxchg won't
17911+ // even work, so you're out of luck anyway.
17912+
17913+ // In summary:
17914+ //
17915+ // Cases that may fail:
17916+ // - fine-grained pinned host memory
17917+ // - fine-grained migratable host memory
17918+ // - fine-grained PCIe peer device
17919+ //
17920+ // Cases that should work, but may be treated overly conservatively:
17921+ // - fine-grained host memory on an APU
17922+ // - fine-grained XGMI peer device
17923+ return AtomicExpansionKind::CmpXChg;
1788017924 }
1788117925
1788217926 return atomicSupportedIfLegalIntType(RMW);
0 commit comments