llvm
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Lines changed: 44 additions & 21 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
Lines changed: 6 additions & 6 deletions
@@ -1074,6 +1074,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering, Position Pos,
                                     AtomicOrdering Order) const {
+  bool Changed = false;
+
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -1147,19 +1149,25 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     }
   }
 
-  // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
-  // will later use this marker to add additional waits such as those required
+  // Always emit a soft wait count at a release, even if it is trivially ~0.
+  // SIInsertWaitcnts will later add additional waits such as those required
   // from direct load to LDS (formerly known as LDS DMA).
-  unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(
-      IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV),
-      LGKMCnt ? 0 : getLgkmcntBitMask(IV));
-  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
-      .addImm(WaitCntImmediate);
+  if (VMCnt || LGKMCnt ||
+      (isReleaseOrStronger(Order) && Scope >= SIAtomicScope::WORKGROUP)) {
+    unsigned WaitCntImmediate =
+      AMDGPU::encodeWaitcnt(IV,
+                            VMCnt ? 0 : getVmcntBitMask(IV),
+                            getExpcntBitMask(IV),
+                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+        .addImm(WaitCntImmediate);
+    Changed = true;
+  }
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return true;
+  return Changed;
 }
 
 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1962,6 +1970,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos, AtomicOrdering Order) const {
+  bool Changed = false;
+
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -2051,25 +2061,32 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     }
   }
 
-  // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
-  // will later use this marker to add additional waits such as those required
+  // Always emit a soft wait count at a release, even if it is trivially ~0.
+  // SIInsertWaitcnts will later add additional waits such as those required
   // from direct load to LDS (formerly known as LDS DMA).
-  unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(
-      IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV),
-      LGKMCnt ? 0 : getLgkmcntBitMask(IV));
-  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
-      .addImm(WaitCntImmediate);
+  if (VMCnt || LGKMCnt ||
+      (isReleaseOrStronger(Order) && Scope >= SIAtomicScope::WORKGROUP)) {
+    unsigned WaitCntImmediate =
+      AMDGPU::encodeWaitcnt(IV,
+                            VMCnt ? 0 : getVmcntBitMask(IV),
+                            getExpcntBitMask(IV),
+                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+        .addImm(WaitCntImmediate);
+    Changed = true;
+  }
 
   if (VSCnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
         .addImm(0);
+    Changed = true;
   }
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return true;
+  return Changed;
 }
 
 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -2278,6 +2295,8 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos, AtomicOrdering Order) const {
+  bool Changed = false;
+
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -2361,26 +2380,30 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
     }
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
-  } else {
-    // Always emit a soft wait count, even if it is trivially ~0.
-    // SIInsertWaitcnts will later use this marker to add additional waits such
-    // as those required from direct load to LDS (formerly known as LDS DMA).
+    Changed = true;
+  } else if (isReleaseOrStronger(Order) && Scope >= SIAtomicScope::WORKGROUP) {
+    // Always emit a soft wait count at a release, even if it is trivially ~0.
+    // SIInsertWaitcnts will later add additional waits such as those required
+    // from direct load to LDS (formerly known as LDS DMA).
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft))
         .addImm(getLoadcntBitMask(IV));
+    Changed = true;
   }
 
   if (STORECnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
+    Changed = true;
   }
 
   if (DSCnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
+    Changed = true;
   }
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return true;
+  return Changed;
 }
 
 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
 
@@ -880,8 +880,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX10-NEXT:    s_add_u32 s0, 0x100, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -921,8 +921,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:384 dlc
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX11-NEXT:    s_add_u32 s0, 0x100, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -991,8 +991,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
 ; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; UNALIGNED_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED_GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX10-NEXT:    s_add_u32 s0, 0x100, s0
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -1032,8 +1032,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; UNALIGNED_GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:384 dlc
-; UNALIGNED_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED_GFX11-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX11-NEXT:    s_add_u32 s0, 0x100, s0
 ; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1520,8 +1520,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -1633,8 +1633,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
 ; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; UNALIGNED_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED_GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1