@@ -1512,14 +1512,13 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x
15121512; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable:
15131513; GFX1250: ; %bb.0: ; %bb
15141514; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
1515- ; GFX1250-NEXT: s_movk_i32 s0, 0x65
1516- ; GFX1250-NEXT: s_movk_i32 s1, 0x64
1517- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1515+ ; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64
1516+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1517+ ; GFX1250-NEXT: v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v34
15181518; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
15191519; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
15201520; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
1521- ; GFX1250-NEXT: v_mov_b32_e32 v41, v34
1522- ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
1521+ ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
15231522; GFX1250-NEXT: s_clause 0x1
15241523; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
15251524; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
@@ -1619,14 +1618,14 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16
16191618; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable:
16201619; GFX1250: ; %bb.0: ; %bb
16211620; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
1622- ; GFX1250-NEXT: s_mov_b64 s[0:1 ], 0x65
1623- ; GFX1250-NEXT: s_mov_b64 s[2:3 ], 0x64
1624- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1 ) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1621+ ; GFX1250-NEXT: v_mov_b64_e32 v[42:43 ], 0x65
1622+ ; GFX1250-NEXT: v_mov_b64_e32 v[44:45 ], 0x64
1623+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3 ) | instskip(SKIP_3) | instid1(VALU_DEP_1)
16251624; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
16261625; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
16271626; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
16281627; GFX1250-NEXT: v_mov_b32_e32 v41, v34
1629- ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s[2:3 ], s[0:1 ] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
1628+ ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v[44:45 ], v[42:43 ] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
16301629; GFX1250-NEXT: s_clause 0x1
16311630; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
16321631; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
@@ -2621,19 +2620,18 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32
26212620; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
26222621; GFX1250: ; %bb.0: ; %bb
26232622; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
2624- ; GFX1250-NEXT: s_movk_i32 s0, 0x65
2625- ; GFX1250-NEXT: s_movk_i32 s1, 0x64
2626- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2623+ ; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64
2624+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
2625+ ; GFX1250-NEXT: v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v26
26272626; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
26282627; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
26292628; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
26302629; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
26312630; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
26322631; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
26332632; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
2634- ; GFX1250-NEXT: v_mov_b32_e32 v41, v26
26352633; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2636- ; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
2634+ ; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
26372635; GFX1250-NEXT: s_clause 0x3
26382636; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
26392637; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
@@ -2774,9 +2772,9 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
27742772; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
27752773; GFX1250: ; %bb.0: ; %bb
27762774; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
2777- ; GFX1250-NEXT: s_mov_b64 s[0:1 ], 0x65
2778- ; GFX1250-NEXT: s_mov_b64 s[2:3 ], 0x64
2779- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1 )
2775+ ; GFX1250-NEXT: v_mov_b64_e32 v[42:43 ], 0x65
2776+ ; GFX1250-NEXT: v_mov_b64_e32 v[44:45 ], 0x64
2777+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3 )
27802778; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
27812779; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
27822780; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
@@ -2786,7 +2784,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
27862784; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
27872785; GFX1250-NEXT: v_mov_b32_e32 v41, v26
27882786; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2789- ; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3 ], s[0:1 ] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
2787+ ; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[44:45 ], v[42:43 ] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
27902788; GFX1250-NEXT: s_clause 0x3
27912789; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
27922790; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
0 commit comments