Skip to content

Commit d97055b

Browse files
committed
[AMDGPU] Create new option for force flush load counter
In certain situations it is beneficial to wait for all outstanding loads, regardless of which specific load's data is needed. This may reduce the number of cache requests. Fixes: SWDEV-511507
1 parent 804b81d commit d97055b

File tree

2 files changed

+56
-0
lines changed

2 files changed

+56
-0
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ static cl::opt<bool>
5353
"s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
5454
cl::init(false), cl::Hidden);
5555

56+
// Hidden command-line flag (-amdgpu-waitcnt-load-forcezero), off by default.
// When set, generateWaitcntInstBefore rewrites every load-counter wait whose
// value is not ~0u to 0, so the emitted wait covers all outstanding loads
// rather than only the specific load whose data is needed.
static cl::opt<bool> ForceEmitZeroLoadFlag(
57+
"amdgpu-waitcnt-load-forcezero",
58+
cl::desc("Force all waitcnt load counters to wait until 0"),
59+
cl::init(false), cl::Hidden);
60+
5661
namespace {
5762
// Class of object that encapsulates latest instruction counter score
5863
// associated with the operand. Used for determining whether
@@ -1850,6 +1855,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
18501855
Wait.BvhCnt = 0;
18511856
}
18521857

1858+
if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
1859+
Wait.LoadCnt = 0;
1860+
18531861
return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
18541862
OldWaitcntInstr);
18551863
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=DEFAULT %s
3+
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-waitcnt-load-forcezero < %s | FileCheck --check-prefixes=LDZERO %s
4+
5+
; Kernel with two global loads feeding two global stores.
; Without the flag the checks expect a vmcnt(1) wait before the first store and
; a vmcnt(0) wait before the second store. With -amdgpu-waitcnt-load-forcezero
; the checks expect one vmcnt(0) wait after the loads, followed by both stores
; grouped under a single s_clause.
define amdgpu_kernel void @copy(ptr addrspace(1) noalias nocapture readonly %src1, ptr addrspace(1) noalias nocapture readonly %src2, ptr addrspace(1) noalias nocapture writeonly %dst1, ptr addrspace(1) noalias nocapture writeonly %dst2) {
6+
; DEFAULT-LABEL: copy:
7+
; DEFAULT: ; %bb.0:
8+
; DEFAULT-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
9+
; DEFAULT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
10+
; DEFAULT-NEXT: s_delay_alu instid0(VALU_DEP_1)
11+
; DEFAULT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
12+
; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
13+
; DEFAULT-NEXT: s_clause 0x1
14+
; DEFAULT-NEXT: global_load_b32 v1, v0, s[0:1]
15+
; DEFAULT-NEXT: global_load_b32 v2, v0, s[2:3]
16+
; DEFAULT-NEXT: s_waitcnt vmcnt(1)
17+
; DEFAULT-NEXT: global_store_b32 v0, v1, s[4:5]
18+
; DEFAULT-NEXT: s_waitcnt vmcnt(0)
19+
; DEFAULT-NEXT: global_store_b32 v0, v2, s[6:7]
20+
; DEFAULT-NEXT: s_endpgm
21+
;
22+
; LDZERO-LABEL: copy:
23+
; LDZERO: ; %bb.0:
24+
; LDZERO-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
25+
; LDZERO-NEXT: v_and_b32_e32 v0, 0x3ff, v0
26+
; LDZERO-NEXT: s_delay_alu instid0(VALU_DEP_1)
27+
; LDZERO-NEXT: v_lshlrev_b32_e32 v0, 2, v0
28+
; LDZERO-NEXT: s_waitcnt lgkmcnt(0)
29+
; LDZERO-NEXT: s_clause 0x1
30+
; LDZERO-NEXT: global_load_b32 v1, v0, s[0:1]
31+
; LDZERO-NEXT: global_load_b32 v2, v0, s[2:3]
32+
; LDZERO-NEXT: s_waitcnt vmcnt(0)
33+
; LDZERO-NEXT: s_clause 0x1
34+
; LDZERO-NEXT: global_store_b32 v0, v1, s[4:5]
35+
; LDZERO-NEXT: global_store_b32 v0, v2, s[6:7]
36+
; LDZERO-NEXT: s_endpgm
37+
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
38+
%idx = zext i32 %id to i64
39+
%gep.ld1 = getelementptr inbounds nuw float, ptr addrspace(1) %src1, i64 %idx
40+
%v1 = load float, ptr addrspace(1) %gep.ld1, align 4
41+
%gep.ld2 = getelementptr inbounds nuw float, ptr addrspace(1) %src2, i64 %idx
42+
%v2 = load float, ptr addrspace(1) %gep.ld2, align 4
43+
%gep.st1 = getelementptr inbounds nuw float, ptr addrspace(1) %dst1, i64 %idx
44+
store float %v1, ptr addrspace(1) %gep.st1, align 4
45+
%gep.st2 = getelementptr inbounds nuw float, ptr addrspace(1) %dst2, i64 %idx
46+
store float %v2, ptr addrspace(1) %gep.st2, align 4
47+
ret void
48+
}

0 commit comments

Comments
 (0)