@@ -16874,19 +16874,60 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   case AtomicRMWInst::UDecWrap: {
     if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
         AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-      // Always expand system scope atomics.
-      if (HasSystemScope) {
-        if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-            Op == AtomicRMWInst::Xor) {
-          // Atomic sub/or/xor do not work over PCI express, but atomic add
-          // does. InstCombine transforms these with 0 to or, so undo that.
-          if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
-              ConstVal && ConstVal->isNullValue())
-            return AtomicExpansionKind::Expand;
-        }
-
-        return AtomicExpansionKind::CmpXChg;
+      // On most subtargets, for atomicrmw operations other than add/xchg,
+      // whether or not the instructions will behave correctly depends on where
+      // the address physically resides and what interconnect is used in the
+      // system configuration. On some targets the instruction will nop,
+      // and in others synchronization will only occur at degraded device scope.
+      //
+      // If the allocation is known local to the device, the instructions should
+      // work correctly.
+      if (RMW->hasMetadata("amdgpu.no.remote.memory"))
+        return atomicSupportedIfLegalIntType(RMW);
+
+      // If fine-grained remote memory works at device scope, we don't need to
+      // do anything.
+      if (!HasSystemScope &&
+          Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+        return atomicSupportedIfLegalIntType(RMW);
+
+      // If we are targeting a remotely allocated address, it depends on what
+      // kind of allocation the address belongs to.
+      //
+      // If the allocation is fine-grained (in host memory, or in PCIe peer
+      // device memory), the operation will fail depending on the target.
+      //
+      // Note fine-grained host memory access does work on APUs or if XGMI is
+      // used, but we do not know if we are targeting an APU or the system
+      // configuration from the ISA version/target-cpu.
+      if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
+        return atomicSupportedIfLegalIntType(RMW);
+
+      if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+          Op == AtomicRMWInst::Xor) {
+        // Atomic sub/or/xor do not work over PCI express, but atomic add
+        // does. InstCombine transforms these with 0 to or, so undo that.
+        if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
+            ConstVal && ConstVal->isNullValue())
+          return AtomicExpansionKind::Expand;
       }
+
+      // If the allocation could be in remote, fine-grained memory, the rmw
+      // instructions may fail. cmpxchg should work, so emit that. On some
+      // system configurations, PCIe atomics aren't supported so cmpxchg won't
+      // even work, so you're out of luck anyway.
+
+      // In summary:
+      //
+      // Cases that may fail:
+      //   - fine-grained pinned host memory
+      //   - fine-grained migratable host memory
+      //   - fine-grained PCIe peer device
+      //
+      // Cases that should work, but may be treated overly conservatively:
+      //   - fine-grained host memory on an APU
+      //   - fine-grained XGMI peer device
+      return AtomicExpansionKind::CmpXChg;
     }

     return atomicSupportedIfLegalIntType(RMW);
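
For illustration, here is a minimal sketch (not part of the patch) of how a frontend or an earlier IR pass might attach the metadata checked above, so that this hook returns atomicSupportedIfLegalIntType instead of expanding to CmpXChg. The helper name markDeviceLocalAtomic and the assumption that the caller has already proven the address is device-local, coarse-grained memory are hypothetical; only the metadata kind strings come from the patch.

// Hypothetical sketch: annotate an atomicrmw whose address the caller has
// proven to be device-local, coarse-grained memory. With these metadata
// kinds present, shouldExpandAtomicRMWInIR keeps the native atomic instead
// of expanding it to a cmpxchg loop.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

static void markDeviceLocalAtomic(AtomicRMWInst &RMW) {
  LLVMContext &Ctx = RMW.getContext();
  // Only the presence of the metadata kind matters; attach empty nodes.
  RMW.setMetadata("amdgpu.no.remote.memory", MDNode::get(Ctx, {}));
  RMW.setMetadata("amdgpu.no.fine.grained.memory", MDNode::get(Ctx, {}));
}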