@@ -16,13 +16,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
1616; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
1717; GCN-NEXT: s_mov_b64 s[36:37], 1
1818; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
19- ; GCN-NEXT: s_mov_b32 s36 , 2
20- ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1]
19+ ; GCN-NEXT: s_mov_b32 s38 , 2
20+ ; GCN-NEXT: s_mov_b32 s39, s37
2121; GCN-NEXT: s_waitcnt lgkmcnt(0)
2222; GCN-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
2323; GCN-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
24+ ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1]
2425; GCN-NEXT: s_waitcnt lgkmcnt(0)
2526; GCN-NEXT: v_accvgpr_write_b32 a0, s0
27+ ; GCN-NEXT: v_accvgpr_write_b32 a16, s16
2628; GCN-NEXT: v_accvgpr_write_b32 a1, s1
2729; GCN-NEXT: v_accvgpr_write_b32 a2, s2
2830; GCN-NEXT: v_accvgpr_write_b32 a3, s3
@@ -38,7 +40,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
3840; GCN-NEXT: v_accvgpr_write_b32 a13, s13
3941; GCN-NEXT: v_accvgpr_write_b32 a14, s14
4042; GCN-NEXT: v_accvgpr_write_b32 a15, s15
41- ; GCN-NEXT: v_accvgpr_write_b32 a16, s16
4243; GCN-NEXT: v_accvgpr_write_b32 a17, s17
4344; GCN-NEXT: v_accvgpr_write_b32 a18, s18
4445; GCN-NEXT: v_accvgpr_write_b32 a19, s19
@@ -317,31 +318,29 @@ bb:
317318define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm (ptr addrspace (1 ) %arg , double %a , double %b ) #0 {
318319; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
319320; GCN: ; %bb.0: ; %bb
320- ; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
321- ; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
322- ; GCN-NEXT: s_mov_b64 s[0:1], 0
321+ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
322+ ; GCN-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x34
323323; GCN-NEXT: s_mov_b64 s[6:7], 1.0
324- ; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
324+ ; GCN-NEXT: s_mov_b64 s[8:9], 0
325+ ; GCN-NEXT: v_accvgpr_write_b32 a0, s8
325326; GCN-NEXT: s_waitcnt lgkmcnt(0)
326- ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
327- ; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
328- ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
329- ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
330- ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
331- ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
332- ; GCN-NEXT: v_accvgpr_write_b32 a4, s4
333- ; GCN-NEXT: v_accvgpr_write_b32 a5, s5
327+ ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
328+ ; GCN-NEXT: v_accvgpr_write_b32 a2, s8
329+ ; GCN-NEXT: v_accvgpr_write_b32 a4, s8
334330; GCN-NEXT: v_accvgpr_write_b32 a6, s6
331+ ; GCN-NEXT: v_accvgpr_write_b32 a1, s9
332+ ; GCN-NEXT: v_accvgpr_write_b32 a3, s9
333+ ; GCN-NEXT: v_accvgpr_write_b32 a5, s9
335334; GCN-NEXT: v_accvgpr_write_b32 a7, s7
336- ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13 ], s[12:13 ] op_sel:[0,1]
335+ ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11 ], s[10:11 ] op_sel:[0,1]
337336; GCN-NEXT: s_nop 1
338337; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
339338; GCN-NEXT: v_mov_b32_e32 v0, 0
340339; GCN-NEXT: s_nop 7
341340; GCN-NEXT: s_nop 7
342341; GCN-NEXT: s_nop 0
343- ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9 ]
344- ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9 ] offset:16
342+ ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1 ]
343+ ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1 ] offset:16
345344; GCN-NEXT: s_endpgm
346345bb:
347346 %mai.1 = tail call <4 x double > @llvm.amdgcn.mfma.f64.16x16x4f64 (double %a , double %b , <4 x double > <double 0 .0 , double 0 .0 , double 0 .0 , double 1 .0 >, i32 0 , i32 0 , i32 0 )
@@ -352,32 +351,29 @@ bb:
352351define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit (ptr addrspace (1 ) %arg , double %a , double %b ) #0 {
353352; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
354353; GCN: ; %bb.0: ; %bb
355- ; GCN-NEXT: s_load_dwordx4 s[8:11 ], s[4:5], 0x24
356- ; GCN-NEXT: s_load_dwordx2 s[12:13 ], s[4:5], 0x34
357- ; GCN-NEXT: s_mov_b32 s0 , 0
358- ; GCN-NEXT: s_mov_b32 s1 , 0x405ec000
359- ; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
354+ ; GCN-NEXT: s_load_dwordx4 s[0:3 ], s[4:5], 0x24
355+ ; GCN-NEXT: s_load_dwordx2 s[8:9 ], s[4:5], 0x34
356+ ; GCN-NEXT: s_mov_b32 s6 , 0
357+ ; GCN-NEXT: s_mov_b32 s7 , 0x405ec000
358+ ; GCN-NEXT: v_accvgpr_write_b32 a0, s6
360359; GCN-NEXT: s_waitcnt lgkmcnt(0)
361- ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
362- ; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
363- ; GCN-NEXT: s_mov_b64 s[6:7], s[0:1]
364- ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
365- ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
366- ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
367- ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
368- ; GCN-NEXT: v_accvgpr_write_b32 a4, s4
369- ; GCN-NEXT: v_accvgpr_write_b32 a5, s5
360+ ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
361+ ; GCN-NEXT: v_accvgpr_write_b32 a2, s6
362+ ; GCN-NEXT: v_accvgpr_write_b32 a4, s6
370363; GCN-NEXT: v_accvgpr_write_b32 a6, s6
364+ ; GCN-NEXT: v_accvgpr_write_b32 a1, s7
365+ ; GCN-NEXT: v_accvgpr_write_b32 a3, s7
366+ ; GCN-NEXT: v_accvgpr_write_b32 a5, s7
371367; GCN-NEXT: v_accvgpr_write_b32 a7, s7
372- ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13 ], s[12:13 ] op_sel:[0,1]
368+ ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[8:9 ], s[8:9 ] op_sel:[0,1]
373369; GCN-NEXT: s_nop 1
374370; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
375371; GCN-NEXT: v_mov_b32_e32 v0, 0
376372; GCN-NEXT: s_nop 7
377373; GCN-NEXT: s_nop 7
378374; GCN-NEXT: s_nop 0
379- ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9 ]
380- ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9 ] offset:16
375+ ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1 ]
376+ ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1 ] offset:16
381377; GCN-NEXT: s_endpgm
382378bb:
383379 %mai.1 = tail call <4 x double > @llvm.amdgcn.mfma.f64.16x16x4f64 (double %a , double %b , <4 x double > <double 123 .0 , double 123 .0 , double 123 .0 , double 123 .0 >, i32 0 , i32 0 , i32 0 )
0 commit comments