Skip to content

Commit b6bd867

Browse files
macurtis-amddsalinas_amdeng
authored andcommitted
AMDGPU: Report unaligned scratch access as fast if supported by tgt (llvm#158036)
This enables more consecutive load folding during aggressive-instcombine. The original motivating example provided by Jeff Byrnes: https://godbolt.org/z/8ebcTEjTs Example provided by Nikita Popov: https://godbolt.org/z/Gv1j4vjqE as part of my original attempt to fix the issue (PR [llvm#133301](llvm#133301), see his [comment](llvm#133301 (comment))). This changes the value of `IsFast` returned by `In SITargetLowering::allowsMisalignedMemoryAccessesImpl` to be non-zero for private and flat addresses if the subtarget supports unaligned scratch accesses. This enables aggressive-instcombine to do more folding of consecutive loads (see [here](https://github.com/llvm/llvm-project/blob/cbd496581fb6953a9a8d8387a010cc3a67d4654b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp#L811)). Summary performance impact on [composable_kernel](https://github.com/ROCm/composable_kernel): |GPU|speedup (geomean*)| |---|---| |MI300A| 1.11| |MI300X| 1.14| |MI350X| 1.03| [*] Just to be clear, this is the geomean across kernels which were impacted by this change - not across all CK kernels.
1 parent 5065ac8 commit b6bd867

File tree

7 files changed

+594
-440
lines changed

7 files changed

+594
-440
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1932,10 +1932,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
19321932
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
19331933
AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
19341934
bool AlignedBy4 = Alignment >= Align(4);
1935+
if (Subtarget->hasUnalignedScratchAccessEnabled()) {
1936+
if (IsFast)
1937+
*IsFast = AlignedBy4 ? Size : 1;
1938+
return true;
1939+
}
1940+
19351941
if (IsFast)
19361942
*IsFast = AlignedBy4;
19371943

1938-
return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1944+
return AlignedBy4;
19391945
}
19401946

19411947
// So long as they are correct, wide global memory operations perform better

llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,25 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
77
; MUBUF-LABEL: memcpy_fixed_align:
88
; MUBUF: ; %bb.0:
99
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10-
; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
1110
; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
1211
; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
12+
; MUBUF-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24
1313
; MUBUF-NEXT: s_lshr_b32 s4, s32, 6
1414
; MUBUF-NEXT: s_waitcnt vmcnt(2)
15-
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32
16-
; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36
17-
; MUBUF-NEXT: s_waitcnt vmcnt(3)
1815
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12
1916
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8
2017
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4
2118
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32
22-
; MUBUF-NEXT: s_waitcnt vmcnt(6)
19+
; MUBUF-NEXT: s_waitcnt vmcnt(5)
2320
; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28
2421
; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24
2522
; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20
2623
; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:16
24+
; MUBUF-NEXT: s_waitcnt vmcnt(8)
25+
; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:36
26+
; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:32
27+
; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:28
28+
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24
2729
; MUBUF-NEXT: ;;#ASMSTART
2830
; MUBUF-NEXT: ; use s4
2931
; MUBUF-NEXT: ;;#ASMEND
@@ -35,14 +37,14 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
3537
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3638
; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
3739
; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
38-
; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
40+
; FLATSCR-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24
3941
; FLATSCR-NEXT: s_mov_b32 s0, s32
4042
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
4143
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32
4244
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
4345
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 offset:16
4446
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
45-
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
47+
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s32 offset:24
4648
; FLATSCR-NEXT: ;;#ASMSTART
4749
; FLATSCR-NEXT: ; use s0
4850
; FLATSCR-NEXT: ;;#ASMEND

llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll

Lines changed: 74 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
1212
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
1313
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
1414
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
15-
; CHECK-NEXT: v_mov_b32_e32 v12, s3
16-
; CHECK-NEXT: v_mov_b32_e32 v11, s2
17-
; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
18-
; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
19-
; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
20-
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
21-
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
22-
; CHECK-NEXT: v_mov_b32_e32 v12, s1
23-
; CHECK-NEXT: v_mov_b32_e32 v11, s0
15+
; CHECK-NEXT: v_mov_b32_e32 v9, s3
16+
; CHECK-NEXT: v_mov_b32_e32 v8, s2
17+
; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
18+
; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
19+
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
20+
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
21+
; CHECK-NEXT: v_mov_b32_e32 v9, s1
22+
; CHECK-NEXT: v_mov_b32_e32 v8, s0
2423
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
25-
; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
26-
; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
27-
; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
28-
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
29-
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
24+
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
25+
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
26+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
27+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
3028
; CHECK-NEXT: s_endpgm
3129
entry:
3230
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -173,33 +171,33 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
173171
; CHECK-NEXT: v_mov_b32_e32 v26, s0
174172
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
175173
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
176-
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
177-
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
178174
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
179175
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
176+
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
180177
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
178+
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
181179
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
182180
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
183-
; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32
184-
; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36
185-
; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40
186-
; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44
187-
; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48
188-
; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52
189-
; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56
190-
; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60
191-
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68
192-
; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76
193-
; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
194-
; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
195-
; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
196-
; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
197-
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72
198-
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64
181+
; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
182+
; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
183+
; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
184+
; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
185+
; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
186+
; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
187+
; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
188+
; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
189+
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
190+
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
191+
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
192+
; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
193+
; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
194+
; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
195+
; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
196+
; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
199197
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
200198
; CHECK-NEXT: v_mov_b32_e32 v25, s1
201199
; CHECK-NEXT: v_mov_b32_e32 v24, s0
202-
; CHECK-NEXT: s_waitcnt vmcnt(18)
200+
; CHECK-NEXT: s_waitcnt vmcnt(20)
203201
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
204202
; CHECK-NEXT: s_waitcnt vmcnt(0)
205203
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -213,10 +211,10 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
213211
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
214212
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
215213
; CHECK-NEXT: s_nop 0
216-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
217-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64
218-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48
219-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32
214+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
215+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
216+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
217+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
220218
; CHECK-NEXT: s_waitcnt vmcnt(0)
221219
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
222220
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@@ -281,8 +279,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
281279
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
282280
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
283281
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
284-
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
285-
; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
282+
; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
283+
; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
286284
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
287285
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
288286
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
@@ -302,21 +300,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
302300
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
303301
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
304302
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
305-
; CHECK-NEXT: v_mov_b32_e32 v12, s3
306-
; CHECK-NEXT: v_mov_b32_e32 v11, s2
307-
; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
308-
; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
309-
; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
310-
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
311-
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
312-
; CHECK-NEXT: v_mov_b32_e32 v12, s1
313-
; CHECK-NEXT: v_mov_b32_e32 v11, s0
303+
; CHECK-NEXT: v_mov_b32_e32 v9, s3
304+
; CHECK-NEXT: v_mov_b32_e32 v8, s2
305+
; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
306+
; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
307+
; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
308+
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
309+
; CHECK-NEXT: v_mov_b32_e32 v9, s1
310+
; CHECK-NEXT: v_mov_b32_e32 v8, s0
314311
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
315-
; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
316-
; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
317-
; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
318-
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
319-
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
312+
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
313+
; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
314+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
315+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
320316
; CHECK-NEXT: s_endpgm
321317
entry:
322318
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -463,33 +459,33 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
463459
; CHECK-NEXT: v_mov_b32_e32 v26, s0
464460
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
465461
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
466-
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
467-
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
468462
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
469463
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
464+
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
470465
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
466+
; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
471467
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
472468
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
473-
; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32
474-
; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36
475-
; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40
476-
; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44
477-
; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48
478-
; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52
479-
; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56
480-
; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60
481-
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68
482-
; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76
483-
; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
484-
; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
485-
; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
486-
; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
487-
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72
488-
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64
469+
; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
470+
; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
471+
; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
472+
; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
473+
; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
474+
; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
475+
; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
476+
; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
477+
; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
478+
; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
479+
; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
480+
; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
481+
; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
482+
; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
483+
; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
484+
; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
489485
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
490486
; CHECK-NEXT: v_mov_b32_e32 v25, s1
491487
; CHECK-NEXT: v_mov_b32_e32 v24, s0
492-
; CHECK-NEXT: s_waitcnt vmcnt(18)
488+
; CHECK-NEXT: s_waitcnt vmcnt(20)
493489
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
494490
; CHECK-NEXT: s_waitcnt vmcnt(0)
495491
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -503,10 +499,10 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
503499
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
504500
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
505501
; CHECK-NEXT: s_nop 0
506-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
507-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64
508-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48
509-
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32
502+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
503+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
504+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
505+
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
510506
; CHECK-NEXT: s_waitcnt vmcnt(0)
511507
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
512508
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@@ -571,8 +567,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
571567
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
572568
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
573569
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
574-
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
575-
; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
570+
; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
571+
; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
576572
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
577573
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
578574
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64

0 commit comments

Comments
 (0)