Skip to content

Commit 9e611ac

Browse files
macurtis-amdronlieb
authored andcommitted
AMDGPU: Report unaligned scratch access as fast if supported by tgt (llvm#158036)
This enables more consecutive load folding during aggressive-instcombine. The original motivating example provided by Jeff Byrnes: https://godbolt.org/z/8ebcTEjTs Example provided by Nikita Popov: https://godbolt.org/z/Gv1j4vjqE as part of my original attempt to fix the issue (PR [llvm#133301](llvm#133301), see his [comment](llvm#133301 (comment))). This changes the value of `IsFast` returned by `In SITargetLowering::allowsMisalignedMemoryAccessesImpl` to be non-zero for private and flat addresses if the subtarget supports unaligned scratch accesses. This enables aggressive-instcombine to do more folding of consecutive loads (see [here](https://github.com/llvm/llvm-project/blob/cbd496581fb6953a9a8d8387a010cc3a67d4654b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp#L811)). Summary performance impact on [composable_kernel](https://github.com/ROCm/composable_kernel): |GPU|speedup (geomean*)| |---|---| |MI300A| 1.11| |MI300X| 1.14| |MI350X| 1.03| [*] Just to be clear, this is the geomean across kernels which were impacted by this change - not across all CK kernels.
1 parent d948fad commit 9e611ac

File tree

7 files changed

+602
-448
lines changed

7 files changed

+602
-448
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1874,10 +1874,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
18741874
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
18751875
AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
18761876
bool AlignedBy4 = Alignment >= Align(4);
1877+
if (Subtarget->hasUnalignedScratchAccessEnabled()) {
1878+
if (IsFast)
1879+
*IsFast = AlignedBy4 ? Size : 1;
1880+
return true;
1881+
}
1882+
18771883
if (IsFast)
18781884
*IsFast = AlignedBy4;
18791885

1880-
return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1886+
return AlignedBy4;
18811887
}
18821888

18831889
// So long as they are correct, wide global memory operations perform better

llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,25 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
77
; MUBUF-LABEL: memcpy_fixed_align:
88
; MUBUF: ; %bb.0:
99
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10-
; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
1110
; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
1211
; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
12+
; MUBUF-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24
1313
; MUBUF-NEXT: s_lshr_b32 s4, s32, 6
1414
; MUBUF-NEXT: s_waitcnt vmcnt(2)
15-
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32
16-
; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36
17-
; MUBUF-NEXT: s_waitcnt vmcnt(3)
1815
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12
1916
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8
2017
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4
2118
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32
22-
; MUBUF-NEXT: s_waitcnt vmcnt(6)
19+
; MUBUF-NEXT: s_waitcnt vmcnt(5)
2320
; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28
2421
; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24
2522
; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20
2623
; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:16
24+
; MUBUF-NEXT: s_waitcnt vmcnt(8)
25+
; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:36
26+
; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:32
27+
; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:28
28+
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24
2729
; MUBUF-NEXT: ;;#ASMSTART
2830
; MUBUF-NEXT: ; use s4
2931
; MUBUF-NEXT: ;;#ASMEND
@@ -35,14 +37,14 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
3537
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3638
; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
3739
; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
38-
; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
40+
; FLATSCR-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24
3941
; FLATSCR-NEXT: s_mov_b32 s0, s32
4042
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
4143
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32
4244
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
4345
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 offset:16
4446
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
45-
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
47+
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s32 offset:24
4648
; FLATSCR-NEXT: ;;#ASMSTART
4749
; FLATSCR-NEXT: ; use s0
4850
; FLATSCR-NEXT: ;;#ASMEND

llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll

Lines changed: 80 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
1010
; CHECK: ; %bb.0: ; %entry
1111
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1212
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
13-
; CHECK-NEXT: v_mov_b32_e32 v12, s3
14-
; CHECK-NEXT: v_mov_b32_e32 v11, s2
15-
; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
16-
; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
17-
; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
18-
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
19-
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
20-
; CHECK-NEXT: v_mov_b32_e32 v12, s1
21-
; CHECK-NEXT: v_mov_b32_e32 v11, s0
13+
; CHECK-NEXT: v_mov_b32_e32 v9, s3
14+
; CHECK-NEXT: v_mov_b32_e32 v8, s2
15+
; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
16+
; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
17+
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
18+
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
19+
; CHECK-NEXT: v_mov_b32_e32 v9, s1
20+
; CHECK-NEXT: v_mov_b32_e32 v8, s0
2221
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
23-
; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
24-
; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
25-
; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
26-
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
27-
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
22+
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
23+
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
24+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
25+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
2826
; CHECK-NEXT: s_endpgm
2927
entry:
3028
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -176,32 +174,32 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
176174
; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
177175
; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
178176
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
179-
; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
180-
; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
181-
; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
182-
; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
183-
; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
184-
; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
185-
; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
186-
; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
187-
; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
188-
; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
189-
; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
190-
; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
191-
; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
192-
; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
193-
; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
194-
; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
177+
; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:92
178+
; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:88
179+
; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:84
180+
; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:80
181+
; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:76
182+
; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:72
183+
; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:68
184+
; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:64
185+
; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:16
186+
; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:20
187+
; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:24
188+
; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:28
195189
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
196190
; CHECK-NEXT: v_mov_b32_e32 v25, s1
197191
; CHECK-NEXT: v_mov_b32_e32 v24, s0
198-
; CHECK-NEXT: s_waitcnt vmcnt(20)
192+
; CHECK-NEXT: s_waitcnt vmcnt(16)
199193
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
200-
; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
194+
; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:32
201195
; CHECK-NEXT: s_nop 0
202-
; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
203-
; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
204-
; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
196+
; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:36
197+
; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:40
198+
; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:44
199+
; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:48
200+
; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:52
201+
; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:56
202+
; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:60
205203
; CHECK-NEXT: s_waitcnt vmcnt(0)
206204
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
207205
; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
@@ -210,11 +208,11 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
210208
; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
211209
; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
212210
; CHECK-NEXT: s_nop 0
213-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
214-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64
215-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48
216-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32
217-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16
211+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
212+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
213+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
214+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:32
215+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:16
218216
; CHECK-NEXT: s_waitcnt vmcnt(0)
219217
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
220218
; CHECK-NEXT: s_endpgm
@@ -276,8 +274,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
276274
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
277275
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
278276
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
279-
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
280-
; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
277+
; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
278+
; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
281279
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
282280
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
283281
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
@@ -295,21 +293,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
295293
; CHECK: ; %bb.0: ; %entry
296294
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
297295
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
298-
; CHECK-NEXT: v_mov_b32_e32 v12, s3
299-
; CHECK-NEXT: v_mov_b32_e32 v11, s2
300-
; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
301-
; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
302-
; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
303-
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
304-
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
305-
; CHECK-NEXT: v_mov_b32_e32 v12, s1
306-
; CHECK-NEXT: v_mov_b32_e32 v11, s0
296+
; CHECK-NEXT: v_mov_b32_e32 v9, s3
297+
; CHECK-NEXT: v_mov_b32_e32 v8, s2
298+
; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
299+
; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
300+
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
301+
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
302+
; CHECK-NEXT: v_mov_b32_e32 v9, s1
303+
; CHECK-NEXT: v_mov_b32_e32 v8, s0
307304
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
308-
; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
309-
; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
310-
; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
311-
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
312-
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
305+
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
306+
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
307+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
308+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
313309
; CHECK-NEXT: s_endpgm
314310
entry:
315311
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -461,32 +457,32 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
461457
; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
462458
; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
463459
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
464-
; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
465-
; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
466-
; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
467-
; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
468-
; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
469-
; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
470-
; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
471-
; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
472-
; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
473-
; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
474-
; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
475-
; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
476-
; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
477-
; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
478-
; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
479-
; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
460+
; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:92
461+
; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:88
462+
; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:84
463+
; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:80
464+
; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:76
465+
; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:72
466+
; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:68
467+
; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:64
468+
; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:16
469+
; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:20
470+
; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:24
471+
; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:28
480472
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
481473
; CHECK-NEXT: v_mov_b32_e32 v25, s1
482474
; CHECK-NEXT: v_mov_b32_e32 v24, s0
483-
; CHECK-NEXT: s_waitcnt vmcnt(20)
475+
; CHECK-NEXT: s_waitcnt vmcnt(16)
484476
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
485-
; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
477+
; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:32
486478
; CHECK-NEXT: s_nop 0
487-
; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
488-
; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
489-
; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
479+
; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:36
480+
; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:40
481+
; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:44
482+
; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:48
483+
; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:52
484+
; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:56
485+
; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:60
490486
; CHECK-NEXT: s_waitcnt vmcnt(0)
491487
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
492488
; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
@@ -495,11 +491,11 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
495491
; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
496492
; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
497493
; CHECK-NEXT: s_nop 0
498-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
499-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64
500-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48
501-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32
502-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16
494+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
495+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
496+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
497+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:32
498+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:16
503499
; CHECK-NEXT: s_waitcnt vmcnt(0)
504500
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
505501
; CHECK-NEXT: s_endpgm
@@ -561,8 +557,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
561557
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
562558
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
563559
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
564-
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
565-
; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
560+
; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
561+
; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
566562
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
567563
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
568564
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64

0 commit comments

Comments
 (0)