Skip to content

Commit 380087e

Browse files
committed
[AMDGPU] Add test with redundant copies to temporary stack slot produced by expandUnalignedLoad
Differential Revision: https://reviews.llvm.org/D88895
1 parent a96bcfb commit 380087e

File tree

1 file changed

+184
-0
lines changed

1 file changed

+184
-0
lines changed
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
3+
4+
; Test that checks for redundant copies to a temporary stack slot produced by
; expandUnalignedLoad.
6+
7+
; @test: loads a <6 x float> through an addrspace(3) pointer with align 4, keeps
; elements 0-3 and writes them with a struct tbuffer store. An export is issued
; before the load.
;
; The expected output pins the redundant copies this file is about: each of the
; four ds_read_b32 results is written to scratch with buffer_store_dword
; (offsets 16-28) and then read back with buffer_load_dword from the same
; offsets before the final tbuffer_store_format_xyzw.
define amdgpu_vs void @test(<4 x i32> inreg %arg1, <6 x float> addrspace(3)* %arg2) {
; CHECK-LABEL: test:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s8, s4
; CHECK-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; CHECK-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; CHECK-NEXT: s_mov_b32 s6, -1
; CHECK-NEXT: s_mov_b32 s7, 0xe8f000
; CHECK-NEXT: s_add_u32 s4, s4, s8
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 8, v0
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_read_b32 v1, v1
; CHECK-NEXT: ds_read_b32 v2, v2
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; CHECK-NEXT: ds_read_b32 v3, v3
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:28
; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:24
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:20
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16
; CHECK-NEXT: s_waitcnt expcnt(1)
; CHECK-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28
; CHECK-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24
; CHECK-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20
; CHECK-NEXT: s_waitcnt expcnt(0)
; CHECK-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: exp mrt0 off, off, off, off
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 0, float undef, float undef, float undef, float undef, i1 immarg false, i1 immarg false)
  %var1 = load <6 x float>, <6 x float> addrspace(3)* %arg2, align 4
  %var2 = shufflevector <6 x float> %var1, <6 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %var2, <4 x i32> %arg1, i32 0, i32 0, i32 0, i32 immarg 126, i32 immarg 0)
  ret void
}
49+
50+
; @test_2: loads an <8 x float> through an addrspace(3) pointer with align 4 and
; stores its two <4 x float> halves (elements 0-3 and 4-7) with two struct
; tbuffer stores. The second call passes 16 where the first passes 0, which
; appears as offset:16 in the expected output.
;
; All eight ds_read_b32 results are also written to scratch with
; buffer_store_dword (offsets 16-44), yet the expected output contains no
; buffer_load_dword: both tbuffer stores take their data directly from the
; registers filled by the ds_read_b32 instructions.
define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, <8 x float> addrspace(3)* %arg4) {
; CHECK-LABEL: test_2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; CHECK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; CHECK-NEXT: s_mov_b32 s10, -1
; CHECK-NEXT: s_mov_b32 s11, 0xe8f000
; CHECK-NEXT: s_add_u32 s8, s8, s5
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 24, v1
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 28, v1
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 16, v1
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 20, v1
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1
; CHECK-NEXT: v_add_i32_e32 v9, vcc, 12, v1
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_read_b32 v4, v2
; CHECK-NEXT: ds_read_b32 v5, v3
; CHECK-NEXT: ds_read_b32 v2, v6
; CHECK-NEXT: ds_read_b32 v3, v7
; CHECK-NEXT: ds_read_b32 v8, v8
; CHECK-NEXT: ds_read_b32 v9, v9
; CHECK-NEXT: ds_read_b32 v7, v10
; CHECK-NEXT: ds_read_b32 v6, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(6)
; CHECK-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:28
; CHECK-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:24
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:20
; CHECK-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:44
; CHECK-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:40
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:36
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:32
; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc
; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc
; CHECK-NEXT: s_endpgm
  %load = load <8 x float>, <8 x float> addrspace(3)* %arg4, align 4
  %vec1 = shufflevector <8 x float> %load, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec1, <4 x i32> %arg1, i32 %arg2, i32 0, i32 %arg3, i32 immarg 77, i32 immarg 3)
  %vec2 = shufflevector <8 x float> %load, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec2, <4 x i32> %arg1, i32 %arg2, i32 16, i32 %arg3, i32 immarg 77, i32 immarg 3)
  ret void
}
98+
99+
; @test_3: performs two <6 x float> align-4 loads through addrspace(3) pointers
; (%arg5, then %arg6). Each loaded value is split into elements 0-3 and
; elements 4-5, which are written with a v4f32 and a v2f32 struct tbuffer store
; respectively. An export sits between the two load/store groups.
;
; Expected output, first group: four buffer_store_dword to scratch
; (offsets 32-44) and no reload; the tbuffer stores read the ds_read_b32
; result registers directly.
; Expected output, second group: four buffer_store_dword to scratch
; (offsets 16-28) that are then reloaded with buffer_load_dword before the
; xyzw store.
; The v2f32 stores pass format immediate 64, which the expected output prints
; as BUF_DATA_FORMAT_INVALID.
define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg %arg3, i32 %arg4, <6 x float> addrspace(3)* %arg5, <6 x float> addrspace(3)* %arg6) {
; CHECK-LABEL: test_3:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; CHECK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; CHECK-NEXT: s_mov_b32 s10, -1
; CHECK-NEXT: s_mov_b32 s11, 0xe8f000
; CHECK-NEXT: s_add_u32 s8, s8, s6
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_mov_b32 s7, s5
; CHECK-NEXT: s_mov_b32 s6, s4
; CHECK-NEXT: s_mov_b32 s5, s3
; CHECK-NEXT: s_mov_b32 s4, s2
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 8, v1
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 12, v1
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v1
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1
; CHECK-NEXT: v_mov_b32_e32 v9, s0
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 8, v2
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 4, v2
; CHECK-NEXT: v_add_i32_e32 v13, vcc, 16, v2
; CHECK-NEXT: v_add_i32_e32 v14, vcc, 20, v2
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_read_b32 v5, v0
; CHECK-NEXT: ds_read_b32 v6, v3
; CHECK-NEXT: ds_read_b32 v4, v4
; CHECK-NEXT: ds_read_b32 v8, v8
; CHECK-NEXT: ds_read_b32 v7, v7
; CHECK-NEXT: ds_read_b32 v3, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
; CHECK-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:44
; CHECK-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:40
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:36
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:32
; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
; CHECK-NEXT: ds_read_b32 v0, v10
; CHECK-NEXT: ds_read_b32 v1, v11
; CHECK-NEXT: s_waitcnt expcnt(1)
; CHECK-NEXT: ds_read_b32 v3, v12
; CHECK-NEXT: ds_read_b32 v4, v13
; CHECK-NEXT: ds_read_b32 v2, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28
; CHECK-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:20
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16
; CHECK-NEXT: s_waitcnt expcnt(1)
; CHECK-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28
; CHECK-NEXT: s_waitcnt expcnt(0)
; CHECK-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24
; CHECK-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20
; CHECK-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16
; CHECK-NEXT: ds_read_b32 v5, v14
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: exp mrt0 off, off, off, off
; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
; CHECK-NEXT: s_endpgm
  %load1 = load <6 x float>, <6 x float> addrspace(3)* %arg5, align 4
  %vec11 = shufflevector <6 x float> %load1, <6 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec11, <4 x i32> %arg3, i32 %arg1, i32 264, i32 %arg2, i32 immarg 77, i32 immarg 3)
  %vec12 = shufflevector <6 x float> %load1, <6 x float> undef, <2 x i32> <i32 4, i32 5>
  call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> %vec12, <4 x i32> %arg3, i32 %arg1, i32 280, i32 %arg2, i32 immarg 64, i32 immarg 3)

  call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 0, float undef, float undef, float undef, float undef, i1 immarg false, i1 immarg false)

  %load2 = load <6 x float>, <6 x float> addrspace(3)* %arg6, align 4
  %vec21 = shufflevector <6 x float> %load2, <6 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec21, <4 x i32> %arg3, i32 %arg1, i32 240, i32 %arg2, i32 immarg 77, i32 immarg 3)
  %vec22 = shufflevector <6 x float> %load2, <6 x float> undef, <2 x i32> <i32 4, i32 5>
  call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> %vec22, <4 x i32> %arg3, i32 %arg1, i32 256, i32 %arg2, i32 immarg 64, i32 immarg 3)

  ret void
}
181+
182+
; AMDGPU intrinsics called by the test functions in this file.
declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg)
declare void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg)
declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg)

0 commit comments

Comments
 (0)