@@ -10,21 +10,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
1010; CHECK: ; %bb.0: ; %entry
1111; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1212; CHECK-NEXT: s_waitcnt lgkmcnt(0)
13- ; CHECK-NEXT: v_mov_b32_e32 v12, s3
14- ; CHECK-NEXT: v_mov_b32_e32 v11, s2
15- ; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
16- ; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
17- ; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
18- ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
19- ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
20- ; CHECK-NEXT: v_mov_b32_e32 v12, s1
21- ; CHECK-NEXT: v_mov_b32_e32 v11, s0
13+ ; CHECK-NEXT: v_mov_b32_e32 v9, s3
14+ ; CHECK-NEXT: v_mov_b32_e32 v8, s2
15+ ; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
16+ ; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
17+ ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
18+ ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
19+ ; CHECK-NEXT: v_mov_b32_e32 v9, s1
20+ ; CHECK-NEXT: v_mov_b32_e32 v8, s0
2221; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
23- ; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
24- ; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
25- ; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
26- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
27- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
22+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
23+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
24+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
25+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
2826; CHECK-NEXT: s_endpgm
2927entry:
3028 tail call void @llvm.memcpy.p0.p0.i64 (ptr %dest , ptr %src , i64 47 , i1 false )
@@ -176,32 +174,32 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
176174; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
177175; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
178176; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
179- ; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
180- ; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
181- ; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
182- ; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
183- ; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
184- ; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
185- ; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
186- ; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
187- ; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
188- ; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
189- ; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
190- ; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
191- ; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
192- ; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
193- ; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
194- ; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
177+ ; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:92
178+ ; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:88
179+ ; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:84
180+ ; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:80
181+ ; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:76
182+ ; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:72
183+ ; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:68
184+ ; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:64
185+ ; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:16
186+ ; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:20
187+ ; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:24
188+ ; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:28
195189; CHECK-NEXT: s_waitcnt lgkmcnt(0)
196190; CHECK-NEXT: v_mov_b32_e32 v25, s1
197191; CHECK-NEXT: v_mov_b32_e32 v24, s0
198- ; CHECK-NEXT: s_waitcnt vmcnt(20 )
192+ ; CHECK-NEXT: s_waitcnt vmcnt(16 )
199193; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
200- ; CHECK-NEXT: buffer_load_dword v3 , v26, s[16:19], 0 offen offset:76
194+ ; CHECK-NEXT: buffer_load_dword v0 , v26, s[16:19], 0 offen offset:32
201195; CHECK-NEXT: s_nop 0
202- ; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
203- ; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
204- ; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
196+ ; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:36
197+ ; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:40
198+ ; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:44
199+ ; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:48
200+ ; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:52
201+ ; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:56
202+ ; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:60
205203; CHECK-NEXT: s_waitcnt vmcnt(0)
206204; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
207205; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
@@ -210,11 +208,11 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
210208; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
211209; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
212210; CHECK-NEXT: s_nop 0
213- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:80
214- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3 ] offset:64
215- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:48
216- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:32
217- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:16
211+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:80
212+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:64
213+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:48
214+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3 ] offset:32
215+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:16
218216; CHECK-NEXT: s_waitcnt vmcnt(0)
219217; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
220218; CHECK-NEXT: s_endpgm
@@ -276,8 +274,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
276274; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
277275; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
278276; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
279- ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
280- ; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
277+ ; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
278+ ; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
281279; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
282280; CHECK-NEXT: s_waitcnt lgkmcnt(0)
283281; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
@@ -295,21 +293,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
295293; CHECK: ; %bb.0: ; %entry
296294; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
297295; CHECK-NEXT: s_waitcnt lgkmcnt(0)
298- ; CHECK-NEXT: v_mov_b32_e32 v12, s3
299- ; CHECK-NEXT: v_mov_b32_e32 v11, s2
300- ; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
301- ; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
302- ; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
303- ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
304- ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
305- ; CHECK-NEXT: v_mov_b32_e32 v12, s1
306- ; CHECK-NEXT: v_mov_b32_e32 v11, s0
296+ ; CHECK-NEXT: v_mov_b32_e32 v9, s3
297+ ; CHECK-NEXT: v_mov_b32_e32 v8, s2
298+ ; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
299+ ; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
300+ ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
301+ ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
302+ ; CHECK-NEXT: v_mov_b32_e32 v9, s1
303+ ; CHECK-NEXT: v_mov_b32_e32 v8, s0
307304; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
308- ; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
309- ; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
310- ; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
311- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
312- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
305+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
306+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
307+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
308+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
313309; CHECK-NEXT: s_endpgm
314310entry:
315311 tail call void @llvm.memcpy.p0.p0.i64 (ptr %dest , ptr %src , i64 47 , i1 false )
@@ -461,32 +457,32 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
461457; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
462458; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
463459; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
464- ; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
465- ; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
466- ; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
467- ; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
468- ; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
469- ; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
470- ; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
471- ; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
472- ; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
473- ; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
474- ; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
475- ; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
476- ; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
477- ; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
478- ; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
479- ; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
460+ ; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:92
461+ ; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:88
462+ ; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:84
463+ ; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:80
464+ ; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:76
465+ ; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:72
466+ ; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:68
467+ ; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:64
468+ ; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:16
469+ ; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:20
470+ ; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:24
471+ ; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:28
480472; CHECK-NEXT: s_waitcnt lgkmcnt(0)
481473; CHECK-NEXT: v_mov_b32_e32 v25, s1
482474; CHECK-NEXT: v_mov_b32_e32 v24, s0
483- ; CHECK-NEXT: s_waitcnt vmcnt(20 )
475+ ; CHECK-NEXT: s_waitcnt vmcnt(16 )
484476; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
485- ; CHECK-NEXT: buffer_load_dword v3 , v26, s[16:19], 0 offen offset:76
477+ ; CHECK-NEXT: buffer_load_dword v0 , v26, s[16:19], 0 offen offset:32
486478; CHECK-NEXT: s_nop 0
487- ; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
488- ; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
489- ; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
479+ ; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:36
480+ ; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:40
481+ ; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:44
482+ ; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:48
483+ ; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:52
484+ ; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:56
485+ ; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:60
490486; CHECK-NEXT: s_waitcnt vmcnt(0)
491487; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
492488; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
@@ -495,11 +491,11 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
495491; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
496492; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
497493; CHECK-NEXT: s_nop 0
498- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:80
499- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3 ] offset:64
500- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:48
501- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:32
502- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:16
494+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:80
495+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:64
496+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:48
497+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3 ] offset:32
498+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:16
503499; CHECK-NEXT: s_waitcnt vmcnt(0)
504500; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
505501; CHECK-NEXT: s_endpgm
@@ -561,8 +557,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
561557; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
562558; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
563559; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
564- ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
565- ; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
560+ ; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
561+ ; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
566562; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
567563; CHECK-NEXT: s_waitcnt lgkmcnt(0)
568564; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
0 commit comments