@@ -16939,19 +16939,60 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1693916939 case AtomicRMWInst::UDecWrap: {
1694016940 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
1694116941 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16942- // Always expand system scope atomics.
16943- if (HasSystemScope) {
16944- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16945- Op == AtomicRMWInst::Xor) {
16946- // Atomic sub/or/xor do not work over PCI express, but atomic add
16947- // does. InstCombine transforms these with 0 to or, so undo that.
16948- if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16949- ConstVal && ConstVal->isNullValue())
16950- return AtomicExpansionKind::Expand;
16951- }
16952-
16953- return AtomicExpansionKind::CmpXChg;
16942+ // On most subtargets, for atomicrmw operations other than add/xchg,
16943+ // whether or not the instructions will behave correctly depends on where
16944+ // the address physically resides and what interconnect is used in the
16945+ // system configuration. On some targets the instruction will nop,
16946+ // and in others synchronization will only occur at degraded device scope.
16947+ //
16948+ // If the allocation is known local to the device, the instructions should
16949+ // work correctly.
16950+ if (RMW->hasMetadata("amdgpu.no.remote.memory"))
16951+ return atomicSupportedIfLegalIntType(RMW);
16952+
16953+ // If fine-grained remote memory works at device scope, we don't need to
16954+ // do anything.
16955+ if (!HasSystemScope &&
16956+ Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16957+ return atomicSupportedIfLegalIntType(RMW);
16958+
16959+ // If we are targeting a remote allocated address, it depends what kind of
16960+ // allocation the address belongs to.
16961+ //
16962+ // If the allocation is fine-grained (in host memory, or in PCIe peer
16963+ // device memory), the operation will fail depending on the target.
16964+ //
16965+ // Note fine-grained host memory access does work on APUs or if XGMI is
16966+ // used, but we do not know if we are targeting an APU or the system
16967+ // configuration from the ISA version/target-cpu.
16968+ if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
16969+ return atomicSupportedIfLegalIntType(RMW);
16970+
16971+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16972+ Op == AtomicRMWInst::Xor) {
16973+ // Atomic sub/or/xor do not work over PCI express, but atomic add
16974+ // does. InstCombine transforms these with 0 to or, so undo that.
16975+ if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16976+ ConstVal && ConstVal->isNullValue())
16977+ return AtomicExpansionKind::Expand;
1695416978 }
16979+
16980+ // If the allocation could be in remote, fine-grained memory, the rmw
16981+ // instructions may fail. cmpxchg should work, so emit that. On some
16982+ // system configurations, PCIe atomics aren't supported so cmpxchg won't
16983+ // even work, so you're out of luck anyway.
16984+
16985+ // In summary:
16986+ //
16987+ // Cases that may fail:
16988+ // - fine-grained pinned host memory
16989+ // - fine-grained migratable host memory
16990+ // - fine-grained PCIe peer device
16991+ //
16992+ // Cases that should work, but may be treated overly conservatively.
16993+ // - fine-grained host memory on an APU
16994+ // - fine-grained XGMI peer device
16995+ return AtomicExpansionKind::CmpXChg;
1695516996 }
1695616997
1695716998 return atomicSupportedIfLegalIntType(RMW);
0 commit comments