Skip to content

Commit 84b433f

Browse files
committed
add waitcnt only for release at workgroup or larger scope
1 parent e6831c6 commit 84b433f

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+264
-8958
lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1074,6 +1074,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
10741074
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
10751075
bool IsCrossAddrSpaceOrdering, Position Pos,
10761076
AtomicOrdering Order) const {
1077+
bool Changed = false;
1078+
10771079
MachineBasicBlock &MBB = *MI->getParent();
10781080
DebugLoc DL = MI->getDebugLoc();
10791081

@@ -1147,19 +1149,25 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
11471149
}
11481150
}
11491151

1150-
// Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
1151-
// will later use this marker to add additional waits such as those required
1152+
// Emit a soft wait count at any workgroup-or-wider release, even if it is ~0.
1153+
// SIInsertWaitcnts will later add additional waits such as those required
11521154
// from direct load to LDS (formerly known as LDS DMA).
1153-
unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(
1154-
IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV),
1155-
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1156-
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1157-
.addImm(WaitCntImmediate);
1155+
if (VMCnt || LGKMCnt ||
1156+
(isReleaseOrStronger(Order) && Scope >= SIAtomicScope::WORKGROUP)) {
1157+
unsigned WaitCntImmediate =
1158+
AMDGPU::encodeWaitcnt(IV,
1159+
VMCnt ? 0 : getVmcntBitMask(IV),
1160+
getExpcntBitMask(IV),
1161+
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1162+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1163+
.addImm(WaitCntImmediate);
1164+
Changed = true;
1165+
}
11581166

11591167
if (Pos == Position::AFTER)
11601168
--MI;
11611169

1162-
return true;
1170+
return Changed;
11631171
}
11641172

11651173
bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1962,6 +1970,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
19621970
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
19631971
bool IsCrossAddrSpaceOrdering,
19641972
Position Pos, AtomicOrdering Order) const {
1973+
bool Changed = false;
1974+
19651975
MachineBasicBlock &MBB = *MI->getParent();
19661976
DebugLoc DL = MI->getDebugLoc();
19671977

@@ -2051,25 +2061,32 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
20512061
}
20522062
}
20532063

2054-
// Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
2055-
// will later use this marker to add additional waits such as those required
2064+
// Emit a soft wait count at any workgroup-or-wider release, even if it is ~0.
2065+
// SIInsertWaitcnts will later add additional waits such as those required
20562066
// from direct load to LDS (formerly known as LDS DMA).
2057-
unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(
2058-
IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV),
2059-
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2060-
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2061-
.addImm(WaitCntImmediate);
2067+
if (VMCnt || LGKMCnt ||
2068+
(isReleaseOrStronger(Order) && Scope >= SIAtomicScope::WORKGROUP)) {
2069+
unsigned WaitCntImmediate =
2070+
AMDGPU::encodeWaitcnt(IV,
2071+
VMCnt ? 0 : getVmcntBitMask(IV),
2072+
getExpcntBitMask(IV),
2073+
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2074+
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2075+
.addImm(WaitCntImmediate);
2076+
Changed = true;
2077+
}
20622078

20632079
if (VSCnt) {
20642080
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
20652081
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
20662082
.addImm(0);
2083+
Changed = true;
20672084
}
20682085

20692086
if (Pos == Position::AFTER)
20702087
--MI;
20712088

2072-
return true;
2089+
return Changed;
20732090
}
20742091

20752092
bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -2278,6 +2295,8 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
22782295
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
22792296
bool IsCrossAddrSpaceOrdering,
22802297
Position Pos, AtomicOrdering Order) const {
2298+
bool Changed = false;
2299+
22812300
MachineBasicBlock &MBB = *MI->getParent();
22822301
DebugLoc DL = MI->getDebugLoc();
22832302

@@ -2361,26 +2380,30 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
23612380
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
23622381
}
23632382
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2364-
} else {
2365-
// Always emit a soft wait count, even if it is trivially ~0.
2366-
// SIInsertWaitcnts will later use this marker to add additional waits such
2367-
// as those required from direct load to LDS (formerly known as LDS DMA).
2383+
Changed = true;
2384+
} else if (isReleaseOrStronger(Order) && Scope >= SIAtomicScope::WORKGROUP) {
2385+
// Always emit a soft wait count at a release, even if it is trivially ~0.
2386+
// SIInsertWaitcnts will later add additional waits such as those required
2387+
// from direct load to LDS (formerly known as LDS DMA).
23682388
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft))
23692389
.addImm(getLoadcntBitMask(IV));
2390+
Changed = true;
23702391
}
23712392

23722393
if (STORECnt) {
23732394
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2395+
Changed = true;
23742396
}
23752397

23762398
if (DSCnt) {
23772399
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2400+
Changed = true;
23782401
}
23792402

23802403
if (Pos == Position::AFTER)
23812404
--MI;
23822405

2383-
return true;
2406+
return Changed;
23842407
}
23852408

23862409
bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -880,8 +880,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
880880
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
881881
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
882882
; GFX10-NEXT: scratch_store_dword v0, v2, off offset:128
883-
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
884883
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
884+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
885885
; GFX10-NEXT: s_lshl_b32 s0, s0, 7
886886
; GFX10-NEXT: s_add_u32 s0, 0x100, s0
887887
; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
@@ -921,8 +921,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
921921
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
922922
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
923923
; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:384 dlc
924-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
925924
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
925+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
926926
; GFX11-NEXT: s_lshl_b32 s0, s0, 7
927927
; GFX11-NEXT: s_add_u32 s0, 0x100, s0
928928
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -991,8 +991,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
991991
; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
992992
; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
993993
; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v2, off offset:128
994-
; UNALIGNED_GFX10-NEXT: s_waitcnt lgkmcnt(0)
995994
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
995+
; UNALIGNED_GFX10-NEXT: s_waitcnt lgkmcnt(0)
996996
; UNALIGNED_GFX10-NEXT: s_lshl_b32 s0, s0, 7
997997
; UNALIGNED_GFX10-NEXT: s_add_u32 s0, 0x100, s0
998998
; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
@@ -1032,8 +1032,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
10321032
; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
10331033
; UNALIGNED_GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
10341034
; UNALIGNED_GFX11-NEXT: scratch_store_b32 v0, v2, off offset:384 dlc
1035-
; UNALIGNED_GFX11-NEXT: s_waitcnt lgkmcnt(0)
10361035
; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1036+
; UNALIGNED_GFX11-NEXT: s_waitcnt lgkmcnt(0)
10371037
; UNALIGNED_GFX11-NEXT: s_lshl_b32 s0, s0, 7
10381038
; UNALIGNED_GFX11-NEXT: s_add_u32 s0, 0x100, s0
10391039
; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1520,8 +1520,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
15201520
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
15211521
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0
15221522
; GFX10-NEXT: scratch_store_dword v0, v2, off offset:128
1523-
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
15241523
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1524+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
15251525
; GFX10-NEXT: s_lshl_b32 s0, s0, 7
15261526
; GFX10-NEXT: s_add_u32 s0, 0x4004, s0
15271527
; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
@@ -1633,8 +1633,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
16331633
; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
16341634
; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0
16351635
; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v2, off offset:128
1636-
; UNALIGNED_GFX10-NEXT: s_waitcnt lgkmcnt(0)
16371636
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1637+
; UNALIGNED_GFX10-NEXT: s_waitcnt lgkmcnt(0)
16381638
; UNALIGNED_GFX10-NEXT: s_lshl_b32 s0, s0, 7
16391639
; UNALIGNED_GFX10-NEXT: s_add_u32 s0, 0x4004, s0
16401640
; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1

0 commit comments

Comments
 (0)