@@ -1512,14 +1512,13 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x
1512
1512
; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable:
1513
1513
; GFX1250: ; %bb.0: ; %bb
1514
1514
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
1515
- ; GFX1250-NEXT: s_movk_i32 s0, 0x65
1516
- ; GFX1250-NEXT: s_movk_i32 s1, 0x64
1517
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1515
+ ; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64
1516
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1517
+ ; GFX1250-NEXT: v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v34
1518
1518
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
1519
1519
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
1520
1520
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
1521
- ; GFX1250-NEXT: v_mov_b32_e32 v41, v34
1522
- ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
1521
+ ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
1523
1522
; GFX1250-NEXT: s_clause 0x1
1524
1523
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
1525
1524
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
@@ -1619,14 +1618,14 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16
1619
1618
; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable:
1620
1619
; GFX1250: ; %bb.0: ; %bb
1621
1620
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
1622
- ; GFX1250-NEXT: s_mov_b64 s[0:1 ], 0x65
1623
- ; GFX1250-NEXT: s_mov_b64 s[2:3 ], 0x64
1624
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1 ) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1621
+ ; GFX1250-NEXT: v_mov_b64_e32 v[42:43 ], 0x65
1622
+ ; GFX1250-NEXT: v_mov_b64_e32 v[44:45 ], 0x64
1623
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3 ) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1625
1624
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
1626
1625
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
1627
1626
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
1628
1627
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
1629
- ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s[2:3 ], s[0:1 ] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
1628
+ ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v[44:45 ], v[42:43 ] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
1630
1629
; GFX1250-NEXT: s_clause 0x1
1631
1630
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
1632
1631
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
@@ -2621,19 +2620,18 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32
2621
2620
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
2622
2621
; GFX1250: ; %bb.0: ; %bb
2623
2622
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
2624
- ; GFX1250-NEXT: s_movk_i32 s0, 0x65
2625
- ; GFX1250-NEXT: s_movk_i32 s1, 0x64
2626
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2623
+ ; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64
2624
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
2625
+ ; GFX1250-NEXT: v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v26
2627
2626
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
2628
2627
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
2629
2628
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
2630
2629
; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
2631
2630
; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
2632
2631
; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
2633
2632
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
2634
- ; GFX1250-NEXT: v_mov_b32_e32 v41, v26
2635
2633
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2636
- ; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
2634
+ ; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
2637
2635
; GFX1250-NEXT: s_clause 0x3
2638
2636
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
2639
2637
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
@@ -2774,9 +2772,9 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
2774
2772
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
2775
2773
; GFX1250: ; %bb.0: ; %bb
2776
2774
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
2777
- ; GFX1250-NEXT: s_mov_b64 s[0:1 ], 0x65
2778
- ; GFX1250-NEXT: s_mov_b64 s[2:3 ], 0x64
2779
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1 )
2775
+ ; GFX1250-NEXT: v_mov_b64_e32 v[42:43 ], 0x65
2776
+ ; GFX1250-NEXT: v_mov_b64_e32 v[44:45 ], 0x64
2777
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3 )
2780
2778
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
2781
2779
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
2782
2780
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
@@ -2786,7 +2784,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
2786
2784
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
2787
2785
; GFX1250-NEXT: v_mov_b32_e32 v41, v26
2788
2786
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
2789
- ; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3 ], s[0:1 ] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
2787
+ ; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[44:45 ], v[42:43 ] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
2790
2788
; GFX1250-NEXT: s_clause 0x3
2791
2789
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
2792
2790
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
0 commit comments