Skip to content

Commit 35a98e9

Browse files
bcahoon and choikwa authored
[AMDGPU] Remove scope check in SIInsertWaitcnts::generateWaitcntInstBefore (llvm#4347)
Co-authored-by: choikwa <[email protected]>
1 parent 2fed156 commit 35a98e9

File tree

2 files changed

+108
-7
lines changed

2 files changed

+108
-7
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1733,13 +1733,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17331733

17341734
// LOAD_CNT is only relevant to vgpr or LDS.
17351735
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1736-
// Only objects with alias scope info were added to LDSDMAScopes array.
1737-
// In the absense of the scope info we will not be able to disambiguate
1738-
// aliasing here. There is no need to try searching for a corresponding
1739-
// store slot. This is conservatively correct because in that case we
1740-
// will produce a wait using the first (general) LDS DMA wait slot which
1741-
// will wait on all of them anyway.
1742-
if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1736+
if (Ptr && Memop->getAAInfo()) {
17431737
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
17441738
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
17451739
if (MI.mayAlias(AA, *LDSDMAStores[I], true))
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx950 < %s | FileCheck %s
3+
4+
declare void @llvm.amdgcn.sched.barrier(i32 %mask)
5+
declare void @llvm.amdgcn.load.to.lds(ptr %in, ptr addrspace(3) %lds_out, i32 %size, i32 %offset, i32 %aux)
6+
7+
define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) %global_buffer, ptr addrspace(3) %lds_buffer1, ptr addrspace(3) %lds_buffer2) #0 {
8+
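; This test checks that the SIInsertWaitcnts pass inserts S_WAITCNT VMCNT(0) before the DS_READ. NOTE(review): the generated check lines below contain only lgkmcnt waits and show llvm.amdgcn.load.to.lds emitted as a call via s_swappc_b64 -- confirm the intrinsic declaration is correct and that the test exercises the intended LDS DMA path.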
9+
; CHECK-LABEL: test_waitcnt:
10+
; CHECK: ; %bb.0: ; %entry
11+
; CHECK-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0
12+
; CHECK-NEXT: s_mov_b32 s12, s8
13+
; CHECK-NEXT: s_mov_b32 s13, s9
14+
; CHECK-NEXT: s_mov_b32 s14, s10
15+
; CHECK-NEXT: s_mov_b64 s[10:11], s[6:7]
16+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
17+
; CHECK-NEXT: s_add_u32 s15, s36, 64
18+
; CHECK-NEXT: s_addc_u32 s18, s37, 0
19+
; CHECK-NEXT: s_add_u32 s8, s4, 16
20+
; CHECK-NEXT: s_addc_u32 s9, s5, 0
21+
; CHECK-NEXT: s_load_dword s6, s[36:37], 0x0
22+
; CHECK-NEXT: s_getpc_b64 s[4:5]
23+
; CHECK-NEXT: s_add_u32 s4, s4, llvm.amdgcn.load.to.lds@gotpcrel32@lo+4
24+
; CHECK-NEXT: s_addc_u32 s5, s5, llvm.amdgcn.load.to.lds@gotpcrel32@hi+12
25+
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
26+
; CHECK-NEXT: v_mov_b32_e32 v40, 0
27+
; CHECK-NEXT: s_mov_b64 s[4:5], s[0:1]
28+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
29+
; CHECK-NEXT: v_mov_b32_e32 v41, s6
30+
; CHECK-NEXT: s_mov_b64 s[6:7], s[2:3]
31+
; CHECK-NEXT: v_mov_b32_e32 v31, v0
32+
; CHECK-NEXT: v_mov_b32_e32 v0, s15
33+
; CHECK-NEXT: v_mov_b32_e32 v1, s18
34+
; CHECK-NEXT: v_mov_b32_e32 v2, s38
35+
; CHECK-NEXT: v_mov_b32_e32 v3, 4
36+
; CHECK-NEXT: v_mov_b32_e32 v4, 4
37+
; CHECK-NEXT: v_mov_b32_e32 v5, 0
38+
; CHECK-NEXT: s_mov_b32 s32, 0
39+
; CHECK-NEXT: global_store_dword v40, v41, s[36:37] offset:64
40+
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
41+
; CHECK-NEXT: ; sched_barrier mask(0x00000000)
42+
; CHECK-NEXT: v_mov_b32_e32 v0, s38
43+
; CHECK-NEXT: v_mov_b32_e32 v1, s39
44+
; CHECK-NEXT: ds_write_b32 v0, v41
45+
; CHECK-NEXT: ds_write_b32 v1, v41
46+
; CHECK-NEXT: ; sched_barrier mask(0x00000000)
47+
; CHECK-NEXT: ds_read_b32 v0, v0
48+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
49+
; CHECK-NEXT: global_store_dword v40, v0, s[36:37] offset:16
50+
; CHECK-NEXT: global_store_dword v40, v41, s[36:37] offset:32
51+
; CHECK-NEXT: s_endpgm
52+
entry:
53+
; VMEM load (no alias metadata) and VMEM store with alias.scope
54+
%vmem_load = load i32, ptr addrspace(1) %global_buffer
55+
%gepvmem = getelementptr i32, ptr addrspace(1) %global_buffer, i32 16
56+
store i32 %vmem_load, ptr addrspace(1) %gepvmem, align 4, !alias.scope !0
57+
58+
; Global to LDS load
59+
%gepvmem.ascast = addrspacecast ptr addrspace(1) %gepvmem to ptr
60+
call void @llvm.amdgcn.load.to.lds(ptr %gepvmem.ascast, ptr addrspace(3) %lds_buffer1, i32 4, i32 4, i32 0), !alias.scope !9, !noalias !14
61+
62+
; Insert scheduling barrier
63+
call void @llvm.amdgcn.sched.barrier(i32 0)
64+
65+
; DS_WRITEs with alias.scope and noalias
66+
store i32 %vmem_load, ptr addrspace(3) %lds_buffer1, align 4, !alias.scope !1, !noalias !12
67+
store i32 %vmem_load, ptr addrspace(3) %lds_buffer2, align 4, !alias.scope !6, !noalias !13
68+
69+
; Insert scheduling barrier
70+
call void @llvm.amdgcn.sched.barrier(i32 0)
71+
72+
; DS_READ with alias.scope missing
73+
%lds_load = load i32, ptr addrspace(3) %lds_buffer1, align 4, !noalias !12
74+
75+
; VMEM write
76+
%gep = getelementptr i32, ptr addrspace(1) %global_buffer, i32 4
77+
%gep2 = getelementptr i32, ptr addrspace(1) %global_buffer, i32 8
78+
store i32 %lds_load, ptr addrspace(1) %gep, align 4, !alias.scope !0
79+
store i32 %vmem_load, ptr addrspace(1) %gep2, align 4, !alias.scope !0
80+
81+
ret void
82+
}
83+
84+
; VMEM alias domain and scope
85+
!5 = !{!"vmem.domain"}
86+
!4 = !{!"vmem.scope", !5}
87+
!0 = !{!4}
88+
89+
; LDS alias domains and scopes
90+
!3 = !{!"lds1.domain"}
91+
!2 = !{!"lds1.scope", !3}
92+
!1 = !{!2}
93+
94+
!8 = !{!"lds2.domain"}
95+
!7 = !{!"lds2.scope", !8}
96+
!6 = !{!7}
97+
98+
!11 = !{!"lds1_off4.domain"}
99+
!10 = !{!"lds1_off4.scope", !11}
100+
!9 = !{!10}
101+
102+
; Noalias lists
103+
!12 = !{!7, !10}
104+
!13 = !{!2, !10}
105+
!14 = !{!2, !7}
106+
107+
attributes #0 = { nounwind }

0 commit comments

Comments
 (0)