@@ -12,21 +12,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
12
12
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
13
13
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
14
14
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
15
- ; CHECK-NEXT: v_mov_b32_e32 v12, s3
16
- ; CHECK-NEXT: v_mov_b32_e32 v11, s2
17
- ; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
18
- ; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
19
- ; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
20
- ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
21
- ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
22
- ; CHECK-NEXT: v_mov_b32_e32 v12, s1
23
- ; CHECK-NEXT: v_mov_b32_e32 v11, s0
15
+ ; CHECK-NEXT: v_mov_b32_e32 v9, s3
16
+ ; CHECK-NEXT: v_mov_b32_e32 v8, s2
17
+ ; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
18
+ ; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
19
+ ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
20
+ ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
21
+ ; CHECK-NEXT: v_mov_b32_e32 v9, s1
22
+ ; CHECK-NEXT: v_mov_b32_e32 v8, s0
24
23
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
25
- ; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
26
- ; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
27
- ; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
28
- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
29
- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
24
+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
25
+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
26
+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
27
+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
30
28
; CHECK-NEXT: s_endpgm
31
29
entry:
32
30
tail call void @llvm.memcpy.p0.p0.i64 (ptr %dest , ptr %src , i64 47 , i1 false )
@@ -173,33 +171,33 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
173
171
; CHECK-NEXT: v_mov_b32_e32 v26, s0
174
172
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
175
173
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
176
- ; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
177
- ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
178
174
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
179
175
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
176
+ ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
180
177
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
178
+ ; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
181
179
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
182
180
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
183
- ; CHECK-NEXT: buffer_load_dword v8 , v26, s[20:23], 0 offen offset:32
184
- ; CHECK-NEXT: buffer_load_dword v9 , v26, s[20:23], 0 offen offset:36
185
- ; CHECK-NEXT: buffer_load_dword v10 , v26, s[20:23], 0 offen offset:40
186
- ; CHECK-NEXT: buffer_load_dword v11 , v26, s[20:23], 0 offen offset:44
187
- ; CHECK-NEXT: buffer_load_dword v12 , v26, s[20:23], 0 offen offset:48
188
- ; CHECK-NEXT: buffer_load_dword v13 , v26, s[20:23], 0 offen offset:52
189
- ; CHECK-NEXT: buffer_load_dword v14 , v26, s[20:23], 0 offen offset:56
190
- ; CHECK-NEXT: buffer_load_dword v15 , v26, s[20:23], 0 offen offset:60
191
- ; CHECK-NEXT: buffer_load_dword v17 , v26, s[20:23], 0 offen offset:68
192
- ; CHECK-NEXT: buffer_load_dword v19 , v26, s[20:23], 0 offen offset:76
193
- ; CHECK-NEXT: buffer_load_dword v21 , v26, s[20:23], 0 offen offset:84
194
- ; CHECK-NEXT: buffer_load_dword v23 , v26, s[20:23], 0 offen offset:92
195
- ; CHECK-NEXT: buffer_load_dword v22 , v26, s[20:23], 0 offen offset:88
196
- ; CHECK-NEXT: buffer_load_dword v20 , v26, s[20:23], 0 offen offset:80
197
- ; CHECK-NEXT: buffer_load_dword v18 , v26, s[20:23], 0 offen offset:72
198
- ; CHECK-NEXT: buffer_load_dword v16 , v26, s[20:23], 0 offen offset:64
181
+ ; CHECK-NEXT: buffer_load_dword v11 , v26, s[20:23], 0 offen offset:92
182
+ ; CHECK-NEXT: buffer_load_dword v10 , v26, s[20:23], 0 offen offset:88
183
+ ; CHECK-NEXT: buffer_load_dword v9 , v26, s[20:23], 0 offen offset:84
184
+ ; CHECK-NEXT: buffer_load_dword v8 , v26, s[20:23], 0 offen offset:80
185
+ ; CHECK-NEXT: buffer_load_dword v15 , v26, s[20:23], 0 offen offset:76
186
+ ; CHECK-NEXT: buffer_load_dword v14 , v26, s[20:23], 0 offen offset:72
187
+ ; CHECK-NEXT: buffer_load_dword v13 , v26, s[20:23], 0 offen offset:68
188
+ ; CHECK-NEXT: buffer_load_dword v12 , v26, s[20:23], 0 offen offset:64
189
+ ; CHECK-NEXT: buffer_load_dword v16 , v26, s[20:23], 0 offen offset:32
190
+ ; CHECK-NEXT: buffer_load_dword v17 , v26, s[20:23], 0 offen offset:36
191
+ ; CHECK-NEXT: buffer_load_dword v18 , v26, s[20:23], 0 offen offset:40
192
+ ; CHECK-NEXT: buffer_load_dword v19 , v26, s[20:23], 0 offen offset:44
193
+ ; CHECK-NEXT: buffer_load_dword v20 , v26, s[20:23], 0 offen offset:48
194
+ ; CHECK-NEXT: buffer_load_dword v21 , v26, s[20:23], 0 offen offset:52
195
+ ; CHECK-NEXT: buffer_load_dword v22 , v26, s[20:23], 0 offen offset:56
196
+ ; CHECK-NEXT: buffer_load_dword v23 , v26, s[20:23], 0 offen offset:60
199
197
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
200
198
; CHECK-NEXT: v_mov_b32_e32 v25, s1
201
199
; CHECK-NEXT: v_mov_b32_e32 v24, s0
202
- ; CHECK-NEXT: s_waitcnt vmcnt(18 )
200
+ ; CHECK-NEXT: s_waitcnt vmcnt(20 )
203
201
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
204
202
; CHECK-NEXT: s_waitcnt vmcnt(0)
205
203
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -213,10 +211,10 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
213
211
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
214
212
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
215
213
; CHECK-NEXT: s_nop 0
216
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:80
217
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:64
218
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:48
219
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:32
214
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:80
215
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:64
216
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:48
217
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:32
220
218
; CHECK-NEXT: s_waitcnt vmcnt(0)
221
219
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
222
220
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@@ -281,8 +279,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
281
279
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
282
280
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
283
281
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
284
- ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
285
- ; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
282
+ ; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
283
+ ; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
286
284
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
287
285
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
288
286
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
@@ -302,21 +300,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
302
300
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
303
301
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
304
302
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
305
- ; CHECK-NEXT: v_mov_b32_e32 v12, s3
306
- ; CHECK-NEXT: v_mov_b32_e32 v11, s2
307
- ; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
308
- ; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
309
- ; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
310
- ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
311
- ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
312
- ; CHECK-NEXT: v_mov_b32_e32 v12, s1
313
- ; CHECK-NEXT: v_mov_b32_e32 v11, s0
303
+ ; CHECK-NEXT: v_mov_b32_e32 v9, s3
304
+ ; CHECK-NEXT: v_mov_b32_e32 v8, s2
305
+ ; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
306
+ ; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
307
+ ; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
308
+ ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
309
+ ; CHECK-NEXT: v_mov_b32_e32 v9, s1
310
+ ; CHECK-NEXT: v_mov_b32_e32 v8, s0
314
311
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
315
- ; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
316
- ; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
317
- ; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
318
- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
319
- ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
312
+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
313
+ ; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
314
+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
315
+ ; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
320
316
; CHECK-NEXT: s_endpgm
321
317
entry:
322
318
tail call void @llvm.memcpy.p0.p0.i64 (ptr %dest , ptr %src , i64 47 , i1 false )
@@ -463,33 +459,33 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
463
459
; CHECK-NEXT: v_mov_b32_e32 v26, s0
464
460
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
465
461
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
466
- ; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
467
- ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
468
462
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
469
463
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
464
+ ; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
470
465
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
466
+ ; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
471
467
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
472
468
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
473
- ; CHECK-NEXT: buffer_load_dword v8 , v26, s[20:23], 0 offen offset:32
474
- ; CHECK-NEXT: buffer_load_dword v9 , v26, s[20:23], 0 offen offset:36
475
- ; CHECK-NEXT: buffer_load_dword v10 , v26, s[20:23], 0 offen offset:40
476
- ; CHECK-NEXT: buffer_load_dword v11 , v26, s[20:23], 0 offen offset:44
477
- ; CHECK-NEXT: buffer_load_dword v12 , v26, s[20:23], 0 offen offset:48
478
- ; CHECK-NEXT: buffer_load_dword v13 , v26, s[20:23], 0 offen offset:52
479
- ; CHECK-NEXT: buffer_load_dword v14 , v26, s[20:23], 0 offen offset:56
480
- ; CHECK-NEXT: buffer_load_dword v15 , v26, s[20:23], 0 offen offset:60
481
- ; CHECK-NEXT: buffer_load_dword v17 , v26, s[20:23], 0 offen offset:68
482
- ; CHECK-NEXT: buffer_load_dword v19 , v26, s[20:23], 0 offen offset:76
483
- ; CHECK-NEXT: buffer_load_dword v21 , v26, s[20:23], 0 offen offset:84
484
- ; CHECK-NEXT: buffer_load_dword v23 , v26, s[20:23], 0 offen offset:92
485
- ; CHECK-NEXT: buffer_load_dword v22 , v26, s[20:23], 0 offen offset:88
486
- ; CHECK-NEXT: buffer_load_dword v20 , v26, s[20:23], 0 offen offset:80
487
- ; CHECK-NEXT: buffer_load_dword v18 , v26, s[20:23], 0 offen offset:72
488
- ; CHECK-NEXT: buffer_load_dword v16 , v26, s[20:23], 0 offen offset:64
469
+ ; CHECK-NEXT: buffer_load_dword v11 , v26, s[20:23], 0 offen offset:92
470
+ ; CHECK-NEXT: buffer_load_dword v10 , v26, s[20:23], 0 offen offset:88
471
+ ; CHECK-NEXT: buffer_load_dword v9 , v26, s[20:23], 0 offen offset:84
472
+ ; CHECK-NEXT: buffer_load_dword v8 , v26, s[20:23], 0 offen offset:80
473
+ ; CHECK-NEXT: buffer_load_dword v15 , v26, s[20:23], 0 offen offset:76
474
+ ; CHECK-NEXT: buffer_load_dword v14 , v26, s[20:23], 0 offen offset:72
475
+ ; CHECK-NEXT: buffer_load_dword v13 , v26, s[20:23], 0 offen offset:68
476
+ ; CHECK-NEXT: buffer_load_dword v12 , v26, s[20:23], 0 offen offset:64
477
+ ; CHECK-NEXT: buffer_load_dword v16 , v26, s[20:23], 0 offen offset:32
478
+ ; CHECK-NEXT: buffer_load_dword v17 , v26, s[20:23], 0 offen offset:36
479
+ ; CHECK-NEXT: buffer_load_dword v18 , v26, s[20:23], 0 offen offset:40
480
+ ; CHECK-NEXT: buffer_load_dword v19 , v26, s[20:23], 0 offen offset:44
481
+ ; CHECK-NEXT: buffer_load_dword v20 , v26, s[20:23], 0 offen offset:48
482
+ ; CHECK-NEXT: buffer_load_dword v21 , v26, s[20:23], 0 offen offset:52
483
+ ; CHECK-NEXT: buffer_load_dword v22 , v26, s[20:23], 0 offen offset:56
484
+ ; CHECK-NEXT: buffer_load_dword v23 , v26, s[20:23], 0 offen offset:60
489
485
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
490
486
; CHECK-NEXT: v_mov_b32_e32 v25, s1
491
487
; CHECK-NEXT: v_mov_b32_e32 v24, s0
492
- ; CHECK-NEXT: s_waitcnt vmcnt(18 )
488
+ ; CHECK-NEXT: s_waitcnt vmcnt(20 )
493
489
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
494
490
; CHECK-NEXT: s_waitcnt vmcnt(0)
495
491
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -503,10 +499,10 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
503
499
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
504
500
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
505
501
; CHECK-NEXT: s_nop 0
506
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:80
507
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:64
508
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:48
509
- ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:32
502
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11 ] offset:80
503
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15 ] offset:64
504
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23 ] offset:48
505
+ ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19 ] offset:32
510
506
; CHECK-NEXT: s_waitcnt vmcnt(0)
511
507
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
512
508
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@@ -571,8 +567,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
571
567
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
572
568
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
573
569
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
574
- ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
575
- ; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
570
+ ; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
571
+ ; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
576
572
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
577
573
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
578
574
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
0 commit comments