@@ -17300,26 +17300,80 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   auto Op = RMW->getOperation();
   switch (Op) {
-  case AtomicRMWInst::Xchg: {
+  case AtomicRMWInst::Xchg:
     // PCIe supports add and xchg for system atomics.
     return isAtomicRMWLegalXChgTy(RMW)
                ? TargetLowering::AtomicExpansionKind::None
                : TargetLowering::AtomicExpansionKind::CmpXChg;
-  }
   case AtomicRMWInst::Add:
-  case AtomicRMWInst::And:
-  case AtomicRMWInst::UIncWrap:
-  case AtomicRMWInst::UDecWrap:
+    // PCIe supports add and xchg for system atomics.
     return atomicSupportedIfLegalIntType(RMW);
   case AtomicRMWInst::Sub:
+  case AtomicRMWInst::And:
   case AtomicRMWInst::Or:
-  case AtomicRMWInst::Xor: {
-    // Atomic sub/or/xor do not work over PCI express, but atomic add
-    // does. InstCombine transforms these with 0 to or, so undo that.
-    if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
-      if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
-          ConstVal && ConstVal->isNullValue())
-        return AtomicExpansionKind::Expand;
+  case AtomicRMWInst::Xor:
+  case AtomicRMWInst::Max:
+  case AtomicRMWInst::Min:
+  case AtomicRMWInst::UMax:
+  case AtomicRMWInst::UMin:
+  case AtomicRMWInst::UIncWrap:
+  case AtomicRMWInst::UDecWrap: {
+    if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
+        AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+      // On most subtargets, for atomicrmw operations other than add/xchg,
+      // whether the instructions will behave correctly depends on where
+      // the address physically resides and what interconnect is used in the
+      // system configuration. On some targets the instruction will nop, and
+      // on others synchronization will only occur at degraded device scope.
+      //
+      // If the allocation is known to be local to the device, the
+      // instructions should work correctly.
+      if (RMW->hasMetadata("amdgpu.no.remote.memory"))
+        return atomicSupportedIfLegalIntType(RMW);
+
+      // If fine-grained remote memory works at device scope, we don't need
+      // to do anything.
+      if (!HasSystemScope &&
+          Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+        return atomicSupportedIfLegalIntType(RMW);
+
+      // If we are targeting a remotely allocated address, it depends on what
+      // kind of allocation the address belongs to.
+      //
+      // If the allocation is fine-grained (in host memory, or in PCIe peer
+      // device memory), the operation will fail depending on the target.
+      //
+      // Note that fine-grained host memory access does work on APUs or if
+      // XGMI is used, but we do not know if we are targeting an APU or the
+      // system configuration from the ISA version/target-cpu.
+      if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
+        return atomicSupportedIfLegalIntType(RMW);
+
+      if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+          Op == AtomicRMWInst::Xor) {
+        // Atomic sub/or/xor do not work over PCI express, but atomic add
+        // does. InstCombine transforms these with 0 to or, so undo that.
+        if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
+            ConstVal && ConstVal->isNullValue())
+          return AtomicExpansionKind::Expand;
+      }
+
+      // If the allocation could be in remote, fine-grained memory, the rmw
+      // instructions may fail. cmpxchg should work, so emit that. On some
+      // system configurations PCIe atomics aren't supported at all, so the
+      // cmpxchg won't work either and you're out of luck anyway.
+      //
+      // In summary:
+      //
+      // Cases that may fail:
+      //   - fine-grained pinned host memory
+      //   - fine-grained migratable host memory
+      //   - fine-grained PCIe peer device
+      //
+      // Cases that should work, but may be treated overly conservatively:
+      //   - fine-grained host memory on an APU
+      //   - fine-grained XGMI peer device
+      return AtomicExpansionKind::CmpXChg;
     }

     return atomicSupportedIfLegalIntType(RMW);
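
Note on the `amdgpu.no.remote.memory` / `amdgpu.no.fine.grained.memory` checks above: these are presence-only instruction metadata, so a frontend or pass that can prove the allocation is device-local attaches an empty `MDNode`. A minimal sketch using the IRBuilder C++ API (the helper name `emitDeviceLocalAtomicOr` is made up for illustration, not part of this patch):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Hypothetical helper: emit an atomicrmw or that this hook will keep as a
// native instruction, because it is tagged as never touching remote memory.
static AtomicRMWInst *emitDeviceLocalAtomicOr(IRBuilder<> &B, Value *Ptr,
                                              Value *Val) {
  AtomicRMWInst *RMW =
      B.CreateAtomicRMW(AtomicRMWInst::Or, Ptr, Val, MaybeAlign(),
                        AtomicOrdering::SequentiallyConsistent);
  // Only the presence of the metadata matters; an empty node is enough.
  RMW->setMetadata("amdgpu.no.remote.memory", MDNode::get(B.getContext(), {}));
  return RMW;
}
```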
@@ -17474,19 +17528,6 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     return AtomicExpansionKind::CmpXChg;
   }
-  case AtomicRMWInst::Min:
-  case AtomicRMWInst::Max:
-  case AtomicRMWInst::UMin:
-  case AtomicRMWInst::UMax: {
-    if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
-        AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-      // Always expand system scope min/max atomics.
-      if (HasSystemScope)
-        return AtomicExpansionKind::CmpXChg;
-    }
-
-    return atomicSupportedIfLegalIntType(RMW);
-  }
   case AtomicRMWInst::Nand:
   case AtomicRMWInst::FSub:
   default:
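
For readers unfamiliar with the `AtomicExpansionKind::CmpXChg` result that min/max (and the other potentially remote cases) now fall back to: AtomicExpandPass rewrites the `atomicrmw` into a compare-exchange retry loop in IR. The shape of that loop, sketched here over `std::atomic` purely for illustration (the real expansion is emitted as LLVM IR, not C++):

```cpp
#include <algorithm>
#include <atomic>
#include <cstdint>

// Conceptual equivalent of expanding `atomicrmw umax` via a cmpxchg loop.
// As the comment in the patch notes, on systems without PCIe atomic support
// even this compare-exchange can fail; the expansion only helps where
// cmpxchg itself works.
uint32_t atomic_umax_via_cmpxchg(std::atomic<uint32_t> &A, uint32_t Val) {
  uint32_t Old = A.load(std::memory_order_relaxed);
  uint32_t New;
  do {
    New = std::max(Old, Val); // apply the RMW operation to the loaded value
    // compare_exchange_weak reloads Old on failure, so the loop retries
    // against the freshly observed value.
  } while (!A.compare_exchange_weak(Old, New, std::memory_order_seq_cst,
                                    std::memory_order_relaxed));
  return Old; // atomicrmw yields the value seen before the update
}
```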