@@ -106,6 +106,7 @@ class SIMemOpInfo final {
106
106
bool IsLastUse = false ;
107
107
bool IsCooperative = false ;
108
108
109
+ // TODO: Should we assume Cooperative=true if no MMO is present?
109
110
SIMemOpInfo (
110
111
const GCNSubtarget &ST,
111
112
AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
@@ -338,6 +339,11 @@ class SICacheControl {
338
339
bool IsNonTemporal,
339
340
bool IsLastUse = false ) const = 0;
340
341
342
+ /// Add final touches to a `mayStore` instruction \p MI, which may be a
343
+ /// Store or RMW instruction.
344
+ /// FIXME: This takes an MI because iterators aren't handled properly. When
345
+ /// this is called, they often point to entirely different insts. Thus we back
346
+ /// up the inst early and pass it here instead.
341
347
virtual bool finalizeStore (MachineInstr &MI, bool Atomic) const {
342
348
// Default implementation: does nothing with MI or Atomic and returns false.
// NOTE(review): presumably false means "MI was not modified" and subclasses
// override this where extra handling is needed -- confirm against callers.
return false ;
343
349
};
@@ -2381,7 +2387,10 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2381
2387
// which shares the same L0.
2382
2388
//
2383
2389
// GFX12.5:
2384
- // TODO DOCS
2390
+ // CU$ has two ports. To ensure operations are visible at the workgroup
2391
+ // level, we need to ensure all operations in this port have completed
2392
+ // so the other SIMDs in the WG can see them. There is no ordering
2393
+ // guarantee between the ports.
2385
2394
if (!ST.isCuModeEnabled () || ST.hasGFX1250Insts ()) {
2386
2395
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2387
2396
LOADCnt |= true ;
@@ -2496,8 +2505,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2496
2505
// Otherwise in CU mode all waves of a work-group are on the same CU, and
2497
2506
// so the L0 does not need to be invalidated.
2498
2507
//
2499
- // GFX12.5
2500
- // TODO DOCS
2508
+ // GFX12.5 has a shared WGP$, so no invalidates are required.
2501
2509
if (ST.isCuModeEnabled ())
2502
2510
return false ;
2503
2511
@@ -2541,7 +2549,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2541
2549
++MI;
2542
2550
2543
2551
// global_wb is only necessary at system scope for GFX12.0,
2544
- // they're also necessary at device scope for GFX12.5.
2552
+ // it is also necessary at device scope for GFX12.5 as stores
2553
+ // cannot report completion earlier than L2.
2545
2554
//
2546
2555
// Emitting it for lower scopes is a slow no-op, so we omit it
2547
2556
// for performance.
@@ -2552,7 +2561,7 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2552
2561
Changed = true ;
2553
2562
break ;
2554
2563
case SIAtomicScope::AGENT:
2555
- // TODO DOCS
2564
+ // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
2556
2565
if (ST.hasGFX1250Insts ()) {
2557
2566
BuildMI (MBB, MI, DL, TII->get (AMDGPU::GLOBAL_WB))
2558
2567
.addImm (AMDGPU::CPol::SCOPE_DEV);
0 commit comments