@@ -16854,26 +16854,80 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16854
16854
16855
16855
auto Op = RMW->getOperation();
16856
16856
switch (Op) {
16857
- case AtomicRMWInst::Xchg: {
16857
+ case AtomicRMWInst::Xchg:
16858
16858
// PCIe supports add and xchg for system atomics.
16859
16859
return isAtomicRMWLegalXChgTy(RMW)
16860
16860
? TargetLowering::AtomicExpansionKind::None
16861
16861
: TargetLowering::AtomicExpansionKind::CmpXChg;
16862
- }
16863
16862
case AtomicRMWInst::Add:
16864
- case AtomicRMWInst::And:
16865
- case AtomicRMWInst::UIncWrap:
16866
- case AtomicRMWInst::UDecWrap:
16863
+ // PCIe supports add and xchg for system atomics.
16867
16864
return atomicSupportedIfLegalIntType(RMW);
16868
16865
case AtomicRMWInst::Sub:
16866
+ case AtomicRMWInst::And:
16869
16867
case AtomicRMWInst::Or:
16870
- case AtomicRMWInst::Xor: {
16871
- // Atomic sub/or/xor do not work over PCI express, but atomic add
16872
- // does. InstCombine transforms these with 0 to or, so undo that.
16873
- if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16874
- if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16875
- ConstVal && ConstVal->isNullValue())
16876
- return AtomicExpansionKind::Expand;
16868
+ case AtomicRMWInst::Xor:
16869
+ case AtomicRMWInst::Max:
16870
+ case AtomicRMWInst::Min:
16871
+ case AtomicRMWInst::UMax:
16872
+ case AtomicRMWInst::UMin:
16873
+ case AtomicRMWInst::UIncWrap:
16874
+ case AtomicRMWInst::UDecWrap: {
16875
+ if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16876
+ AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16877
+ // On most subtargets, for atomicrmw operations other than add/xchg,
16878
+ // whether or not the instructions will behave correctly depends on where
16879
+ // the address physically resides and what interconnect is used in the
16880
+ // system configuration. On some some targets the instruction will nop,
16881
+ // and in others synchronization will only occur at degraded device scope.
16882
+ //
16883
+ // If the allocation is known local to the device, the instructions should
16884
+ // work correctly.
16885
+ if (RMW->hasMetadata("amdgpu.no.remote.memory"))
16886
+ return atomicSupportedIfLegalIntType(RMW);
16887
+
16888
+ // If fine-grained remote memory works at device scope, we don't need to
16889
+ // do anything.
16890
+ if (!HasSystemScope &&
16891
+ Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16892
+ return atomicSupportedIfLegalIntType(RMW);
16893
+
16894
+ // If we are targeting a remote allocated address, it depends what kind of
16895
+ // allocation the address belongs to.
16896
+ //
16897
+ // If the allocation is fine-grained (in host memory, or in PCIe peer
16898
+ // device memory), the operation will fail depending on the target.
16899
+ //
16900
+ // Note fine-grained host memory access does work on APUs or if XGMI is
16901
+ // used, but we do not know if we are targeting an APU or the system
16902
+ // configuration from the ISA version/target-cpu.
16903
+ if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
16904
+ return atomicSupportedIfLegalIntType(RMW);
16905
+
16906
+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16907
+ Op == AtomicRMWInst::Xor) {
16908
+ // Atomic sub/or/xor do not work over PCI express, but atomic add
16909
+ // does. InstCombine transforms these with 0 to or, so undo that.
16910
+ if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16911
+ ConstVal && ConstVal->isNullValue())
16912
+ return AtomicExpansionKind::Expand;
16913
+ }
16914
+
16915
+ // If the allocation could be in remote, fine-grained memory, the rmw
16916
+ // instructions may fail. cmpxchg should work, so emit that. On some
16917
+ // system configurations, PCIe atomics aren't supported so cmpxchg won't
16918
+ // even work, so you're out of luck anyway.
16919
+
16920
+ // In summary:
16921
+ //
16922
+ // Cases that may fail:
16923
+ // - fine-grained pinned host memory
16924
+ // - fine-grained migratable host memory
16925
+ // - fine-grained PCIe peer device
16926
+ //
16927
+ // Cases that should work, but may be treated overly conservatively.
16928
+ // - fine-grained host memory on an APU
16929
+ // - fine-grained XGMI peer device
16930
+ return AtomicExpansionKind::CmpXChg;
16877
16931
}
16878
16932
16879
16933
return atomicSupportedIfLegalIntType(RMW);
@@ -17028,19 +17082,6 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
17028
17082
17029
17083
return AtomicExpansionKind::CmpXChg;
17030
17084
}
17031
- case AtomicRMWInst::Min:
17032
- case AtomicRMWInst::Max:
17033
- case AtomicRMWInst::UMin:
17034
- case AtomicRMWInst::UMax: {
17035
- if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
17036
- AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17037
- // Always expand system scope min/max atomics.
17038
- if (HasSystemScope)
17039
- return AtomicExpansionKind::CmpXChg;
17040
- }
17041
-
17042
- return atomicSupportedIfLegalIntType(RMW);
17043
- }
17044
17085
case AtomicRMWInst::Nand:
17045
17086
case AtomicRMWInst::FSub:
17046
17087
default:
0 commit comments