@@ -12,21 +12,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
1212; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
1313; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
1414; CHECK-NEXT: s_waitcnt lgkmcnt(0)
15- ; CHECK-NEXT: v_mov_b32_e32 v12, s3
16- ; CHECK-NEXT: v_mov_b32_e32 v11, s2
17- ; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
18- ; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
19- ; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
20- ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
21- ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
22- ; CHECK-NEXT: v_mov_b32_e32 v12, s1
23- ; CHECK-NEXT: v_mov_b32_e32 v11, s0
15+ ; CHECK-NEXT: v_mov_b32_e32 v9, s3
16+ ; CHECK-NEXT: v_mov_b32_e32 v8, s2
17+ ; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
18+ ; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
19+ ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
20+ ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
21+ ; CHECK-NEXT: v_mov_b32_e32 v9, s1
22+ ; CHECK-NEXT: v_mov_b32_e32 v8, s0
2423; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
25- ; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
26- ; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
27- ; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
28- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
29- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
24+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
25+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
26+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
27+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
3028; CHECK-NEXT: s_endpgm
3129entry:
3230 tail call void @llvm.memcpy.p0.p0.i64 (ptr %dest , ptr %src , i64 47 , i1 false )
@@ -173,33 +171,33 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
173171; CHECK-NEXT: v_mov_b32_e32 v26, s0
174172; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
175173; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
176- ; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
177- ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
178174; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
179175; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
176+ ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
180177; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
178+ ; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
181179; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
182180; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
183- ; CHECK-NEXT: buffer_load_dword v8 , v26, s[20:23], 0 offen offset:32
184- ; CHECK-NEXT: buffer_load_dword v9 , v26, s[20:23], 0 offen offset:36
185- ; CHECK-NEXT: buffer_load_dword v10 , v26, s[20:23], 0 offen offset:40
186- ; CHECK-NEXT: buffer_load_dword v11 , v26, s[20:23], 0 offen offset:44
187- ; CHECK-NEXT: buffer_load_dword v12 , v26, s[20:23], 0 offen offset:48
188- ; CHECK-NEXT: buffer_load_dword v13 , v26, s[20:23], 0 offen offset:52
189- ; CHECK-NEXT: buffer_load_dword v14 , v26, s[20:23], 0 offen offset:56
190- ; CHECK-NEXT: buffer_load_dword v15 , v26, s[20:23], 0 offen offset:60
191- ; CHECK-NEXT: buffer_load_dword v17 , v26, s[20:23], 0 offen offset:68
192- ; CHECK-NEXT: buffer_load_dword v19 , v26, s[20:23], 0 offen offset:76
193- ; CHECK-NEXT: buffer_load_dword v21 , v26, s[20:23], 0 offen offset:84
194- ; CHECK-NEXT: buffer_load_dword v23 , v26, s[20:23], 0 offen offset:92
195- ; CHECK-NEXT: buffer_load_dword v22 , v26, s[20:23], 0 offen offset:88
196- ; CHECK-NEXT: buffer_load_dword v20 , v26, s[20:23], 0 offen offset:80
197- ; CHECK-NEXT: buffer_load_dword v18 , v26, s[20:23], 0 offen offset:72
198- ; CHECK-NEXT: buffer_load_dword v16 , v26, s[20:23], 0 offen offset:64
181+ ; CHECK-NEXT: buffer_load_dword v11 , v26, s[20:23], 0 offen offset:92
182+ ; CHECK-NEXT: buffer_load_dword v10 , v26, s[20:23], 0 offen offset:88
183+ ; CHECK-NEXT: buffer_load_dword v9 , v26, s[20:23], 0 offen offset:84
184+ ; CHECK-NEXT: buffer_load_dword v8 , v26, s[20:23], 0 offen offset:80
185+ ; CHECK-NEXT: buffer_load_dword v15 , v26, s[20:23], 0 offen offset:76
186+ ; CHECK-NEXT: buffer_load_dword v14 , v26, s[20:23], 0 offen offset:72
187+ ; CHECK-NEXT: buffer_load_dword v13 , v26, s[20:23], 0 offen offset:68
188+ ; CHECK-NEXT: buffer_load_dword v12 , v26, s[20:23], 0 offen offset:64
189+ ; CHECK-NEXT: buffer_load_dword v16 , v26, s[20:23], 0 offen offset:32
190+ ; CHECK-NEXT: buffer_load_dword v17 , v26, s[20:23], 0 offen offset:36
191+ ; CHECK-NEXT: buffer_load_dword v18 , v26, s[20:23], 0 offen offset:40
192+ ; CHECK-NEXT: buffer_load_dword v19 , v26, s[20:23], 0 offen offset:44
193+ ; CHECK-NEXT: buffer_load_dword v20 , v26, s[20:23], 0 offen offset:48
194+ ; CHECK-NEXT: buffer_load_dword v21 , v26, s[20:23], 0 offen offset:52
195+ ; CHECK-NEXT: buffer_load_dword v22 , v26, s[20:23], 0 offen offset:56
196+ ; CHECK-NEXT: buffer_load_dword v23 , v26, s[20:23], 0 offen offset:60
199197; CHECK-NEXT: s_waitcnt lgkmcnt(0)
200198; CHECK-NEXT: v_mov_b32_e32 v25, s1
201199; CHECK-NEXT: v_mov_b32_e32 v24, s0
202- ; CHECK-NEXT: s_waitcnt vmcnt(18 )
200+ ; CHECK-NEXT: s_waitcnt vmcnt(20 )
203201; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
204202; CHECK-NEXT: s_waitcnt vmcnt(0)
205203; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -213,10 +211,10 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
213211; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
214212; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
215213; CHECK-NEXT: s_nop 0
216- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:80
217- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:64
218- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:48
219- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:32
214+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:80
215+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:64
216+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:48
217+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:32
220218; CHECK-NEXT: s_waitcnt vmcnt(0)
221219; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
222220; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@@ -281,8 +279,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
281279; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
282280; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
283281; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
284- ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
285- ; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
282+ ; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
283+ ; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
286284; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
287285; CHECK-NEXT: s_waitcnt lgkmcnt(0)
288286; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
@@ -302,21 +300,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
302300; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
303301; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
304302; CHECK-NEXT: s_waitcnt lgkmcnt(0)
305- ; CHECK-NEXT: v_mov_b32_e32 v12, s3
306- ; CHECK-NEXT: v_mov_b32_e32 v11, s2
307- ; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
308- ; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
309- ; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
310- ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
311- ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
312- ; CHECK-NEXT: v_mov_b32_e32 v12, s1
313- ; CHECK-NEXT: v_mov_b32_e32 v11, s0
303+ ; CHECK-NEXT: v_mov_b32_e32 v9, s3
304+ ; CHECK-NEXT: v_mov_b32_e32 v8, s2
305+ ; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
306+ ; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
307+ ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
308+ ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
309+ ; CHECK-NEXT: v_mov_b32_e32 v9, s1
310+ ; CHECK-NEXT: v_mov_b32_e32 v8, s0
314311; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
315- ; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
316- ; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
317- ; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
318- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
319- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
312+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
313+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
314+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
315+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
320316; CHECK-NEXT: s_endpgm
321317entry:
322318 tail call void @llvm.memcpy.p0.p0.i64 (ptr %dest , ptr %src , i64 47 , i1 false )
@@ -463,33 +459,33 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
463459; CHECK-NEXT: v_mov_b32_e32 v26, s0
464460; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
465461; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
466- ; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
467- ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
468462; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
469463; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
464+ ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
470465; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
466+ ; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
471467; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
472468; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
473- ; CHECK-NEXT: buffer_load_dword v8 , v26, s[20:23], 0 offen offset:32
474- ; CHECK-NEXT: buffer_load_dword v9 , v26, s[20:23], 0 offen offset:36
475- ; CHECK-NEXT: buffer_load_dword v10 , v26, s[20:23], 0 offen offset:40
476- ; CHECK-NEXT: buffer_load_dword v11 , v26, s[20:23], 0 offen offset:44
477- ; CHECK-NEXT: buffer_load_dword v12 , v26, s[20:23], 0 offen offset:48
478- ; CHECK-NEXT: buffer_load_dword v13 , v26, s[20:23], 0 offen offset:52
479- ; CHECK-NEXT: buffer_load_dword v14 , v26, s[20:23], 0 offen offset:56
480- ; CHECK-NEXT: buffer_load_dword v15 , v26, s[20:23], 0 offen offset:60
481- ; CHECK-NEXT: buffer_load_dword v17 , v26, s[20:23], 0 offen offset:68
482- ; CHECK-NEXT: buffer_load_dword v19 , v26, s[20:23], 0 offen offset:76
483- ; CHECK-NEXT: buffer_load_dword v21 , v26, s[20:23], 0 offen offset:84
484- ; CHECK-NEXT: buffer_load_dword v23 , v26, s[20:23], 0 offen offset:92
485- ; CHECK-NEXT: buffer_load_dword v22 , v26, s[20:23], 0 offen offset:88
486- ; CHECK-NEXT: buffer_load_dword v20 , v26, s[20:23], 0 offen offset:80
487- ; CHECK-NEXT: buffer_load_dword v18 , v26, s[20:23], 0 offen offset:72
488- ; CHECK-NEXT: buffer_load_dword v16 , v26, s[20:23], 0 offen offset:64
469+ ; CHECK-NEXT: buffer_load_dword v11 , v26, s[20:23], 0 offen offset:92
470+ ; CHECK-NEXT: buffer_load_dword v10 , v26, s[20:23], 0 offen offset:88
471+ ; CHECK-NEXT: buffer_load_dword v9 , v26, s[20:23], 0 offen offset:84
472+ ; CHECK-NEXT: buffer_load_dword v8 , v26, s[20:23], 0 offen offset:80
473+ ; CHECK-NEXT: buffer_load_dword v15 , v26, s[20:23], 0 offen offset:76
474+ ; CHECK-NEXT: buffer_load_dword v14 , v26, s[20:23], 0 offen offset:72
475+ ; CHECK-NEXT: buffer_load_dword v13 , v26, s[20:23], 0 offen offset:68
476+ ; CHECK-NEXT: buffer_load_dword v12 , v26, s[20:23], 0 offen offset:64
477+ ; CHECK-NEXT: buffer_load_dword v16 , v26, s[20:23], 0 offen offset:32
478+ ; CHECK-NEXT: buffer_load_dword v17 , v26, s[20:23], 0 offen offset:36
479+ ; CHECK-NEXT: buffer_load_dword v18 , v26, s[20:23], 0 offen offset:40
480+ ; CHECK-NEXT: buffer_load_dword v19 , v26, s[20:23], 0 offen offset:44
481+ ; CHECK-NEXT: buffer_load_dword v20 , v26, s[20:23], 0 offen offset:48
482+ ; CHECK-NEXT: buffer_load_dword v21 , v26, s[20:23], 0 offen offset:52
483+ ; CHECK-NEXT: buffer_load_dword v22 , v26, s[20:23], 0 offen offset:56
484+ ; CHECK-NEXT: buffer_load_dword v23 , v26, s[20:23], 0 offen offset:60
489485; CHECK-NEXT: s_waitcnt lgkmcnt(0)
490486; CHECK-NEXT: v_mov_b32_e32 v25, s1
491487; CHECK-NEXT: v_mov_b32_e32 v24, s0
492- ; CHECK-NEXT: s_waitcnt vmcnt(18 )
488+ ; CHECK-NEXT: s_waitcnt vmcnt(20 )
493489; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
494490; CHECK-NEXT: s_waitcnt vmcnt(0)
495491; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -503,10 +499,10 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
503499; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
504500; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
505501; CHECK-NEXT: s_nop 0
506- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:80
507- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:64
508- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:48
509- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:32
502+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:80
503+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:64
504+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:48
505+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:32
510506; CHECK-NEXT: s_waitcnt vmcnt(0)
511507; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
512508; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@@ -571,8 +567,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
571567; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
572568; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
573569; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
574- ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
575- ; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
570+ ; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
571+ ; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
576572; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
577573; CHECK-NEXT: s_waitcnt lgkmcnt(0)
578574; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
0 commit comments