@@ -10,21 +10,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
10
10
; CHECK: ; %bb.0: ; %entry
11
11
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
12
12
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
13
- ; CHECK-NEXT: v_mov_b32_e32 v12, s3
14
- ; CHECK-NEXT: v_mov_b32_e32 v11, s2
15
- ; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
16
- ; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
17
- ; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
18
- ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
19
- ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
20
- ; CHECK-NEXT: v_mov_b32_e32 v12, s1
21
- ; CHECK-NEXT: v_mov_b32_e32 v11, s0
13
+ ; CHECK-NEXT: v_mov_b32_e32 v9, s3
14
+ ; CHECK-NEXT: v_mov_b32_e32 v8, s2
15
+ ; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
16
+ ; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
17
+ ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
18
+ ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
19
+ ; CHECK-NEXT: v_mov_b32_e32 v9, s1
20
+ ; CHECK-NEXT: v_mov_b32_e32 v8, s0
22
21
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
23
- ; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
24
- ; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
25
- ; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
26
- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
27
- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
22
+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
23
+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
24
+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
25
+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
28
26
; CHECK-NEXT: s_endpgm
29
27
entry:
30
28
tail call void @llvm.memcpy.p0.p0.i64 (ptr %dest , ptr %src , i64 47 , i1 false )
@@ -176,32 +174,32 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
176
174
; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
177
175
; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
178
176
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
179
- ; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
180
- ; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
181
- ; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
182
- ; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
183
- ; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
184
- ; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
185
- ; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
186
- ; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
187
- ; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
188
- ; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
189
- ; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
190
- ; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
191
- ; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
192
- ; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
193
- ; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
194
- ; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
177
+ ; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:92
178
+ ; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:88
179
+ ; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:84
180
+ ; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:80
181
+ ; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:76
182
+ ; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:72
183
+ ; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:68
184
+ ; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:64
185
+ ; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:16
186
+ ; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:20
187
+ ; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:24
188
+ ; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:28
195
189
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
196
190
; CHECK-NEXT: v_mov_b32_e32 v25, s1
197
191
; CHECK-NEXT: v_mov_b32_e32 v24, s0
198
- ; CHECK-NEXT: s_waitcnt vmcnt(20 )
192
+ ; CHECK-NEXT: s_waitcnt vmcnt(16 )
199
193
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
200
- ; CHECK-NEXT: buffer_load_dword v3 , v26, s[16:19], 0 offen offset:76
194
+ ; CHECK-NEXT: buffer_load_dword v0 , v26, s[16:19], 0 offen offset:32
201
195
; CHECK-NEXT: s_nop 0
202
- ; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
203
- ; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
204
- ; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
196
+ ; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:36
197
+ ; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:40
198
+ ; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:44
199
+ ; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:48
200
+ ; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:52
201
+ ; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:56
202
+ ; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:60
205
203
; CHECK-NEXT: s_waitcnt vmcnt(0)
206
204
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
207
205
; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
@@ -210,11 +208,11 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
210
208
; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
211
209
; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
212
210
; CHECK-NEXT: s_nop 0
213
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:80
214
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3 ] offset:64
215
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:48
216
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:32
217
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:16
211
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:80
212
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:64
213
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:48
214
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3 ] offset:32
215
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:16
218
216
; CHECK-NEXT: s_waitcnt vmcnt(0)
219
217
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
220
218
; CHECK-NEXT: s_endpgm
@@ -276,8 +274,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
276
274
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
277
275
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
278
276
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
279
- ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
280
- ; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
277
+ ; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
278
+ ; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
281
279
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
282
280
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
283
281
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
@@ -295,21 +293,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
295
293
; CHECK: ; %bb.0: ; %entry
296
294
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
297
295
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
298
- ; CHECK-NEXT: v_mov_b32_e32 v12, s3
299
- ; CHECK-NEXT: v_mov_b32_e32 v11, s2
300
- ; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
301
- ; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
302
- ; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
303
- ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
304
- ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
305
- ; CHECK-NEXT: v_mov_b32_e32 v12, s1
306
- ; CHECK-NEXT: v_mov_b32_e32 v11, s0
296
+ ; CHECK-NEXT: v_mov_b32_e32 v9, s3
297
+ ; CHECK-NEXT: v_mov_b32_e32 v8, s2
298
+ ; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
299
+ ; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
300
+ ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
301
+ ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
302
+ ; CHECK-NEXT: v_mov_b32_e32 v9, s1
303
+ ; CHECK-NEXT: v_mov_b32_e32 v8, s0
307
304
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
308
- ; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
309
- ; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
310
- ; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
311
- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
312
- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
305
+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
306
+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
307
+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
308
+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
313
309
; CHECK-NEXT: s_endpgm
314
310
entry:
315
311
tail call void @llvm.memcpy.p0.p0.i64 (ptr %dest , ptr %src , i64 47 , i1 false )
@@ -461,32 +457,32 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
461
457
; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
462
458
; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
463
459
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
464
- ; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
465
- ; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
466
- ; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
467
- ; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
468
- ; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
469
- ; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
470
- ; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
471
- ; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
472
- ; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
473
- ; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
474
- ; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
475
- ; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
476
- ; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
477
- ; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
478
- ; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
479
- ; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
460
+ ; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:92
461
+ ; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:88
462
+ ; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:84
463
+ ; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:80
464
+ ; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:76
465
+ ; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:72
466
+ ; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:68
467
+ ; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:64
468
+ ; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:16
469
+ ; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:20
470
+ ; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:24
471
+ ; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:28
480
472
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
481
473
; CHECK-NEXT: v_mov_b32_e32 v25, s1
482
474
; CHECK-NEXT: v_mov_b32_e32 v24, s0
483
- ; CHECK-NEXT: s_waitcnt vmcnt(20 )
475
+ ; CHECK-NEXT: s_waitcnt vmcnt(16 )
484
476
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
485
- ; CHECK-NEXT: buffer_load_dword v3 , v26, s[16:19], 0 offen offset:76
477
+ ; CHECK-NEXT: buffer_load_dword v0 , v26, s[16:19], 0 offen offset:32
486
478
; CHECK-NEXT: s_nop 0
487
- ; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
488
- ; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
489
- ; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
479
+ ; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:36
480
+ ; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:40
481
+ ; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:44
482
+ ; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:48
483
+ ; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:52
484
+ ; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:56
485
+ ; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:60
490
486
; CHECK-NEXT: s_waitcnt vmcnt(0)
491
487
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
492
488
; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
@@ -495,11 +491,11 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
495
491
; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
496
492
; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
497
493
; CHECK-NEXT: s_nop 0
498
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:80
499
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3 ] offset:64
500
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:48
501
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:32
502
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:16
494
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:80
495
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:64
496
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:48
497
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3 ] offset:32
498
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:16
503
499
; CHECK-NEXT: s_waitcnt vmcnt(0)
504
500
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
505
501
; CHECK-NEXT: s_endpgm
@@ -561,8 +557,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
561
557
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
562
558
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
563
559
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
564
- ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
565
- ; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
560
+ ; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
561
+ ; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
566
562
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
567
563
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
568
564
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
0 commit comments