@@ -809,10 +809,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
809
809
; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5
810
810
; GFX1250-NEXT: v_mov_b32_e32 v8, v1
811
811
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
812
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5 ], v6, v4, v[8:9]
813
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3 ], v7, v3, v[4:5 ]
812
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11 ], v6, v4, v[8:9]
813
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5 ], v7, v3, v[10:11 ]
814
814
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
815
- ; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
815
+ ; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
816
816
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
817
817
%result = mul i96 %num , %den
818
818
ret i96 %result
@@ -1218,16 +1218,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
1218
1218
; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1]
1219
1219
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0
1220
1220
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1221
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11 ], v2, v4, v[10:11]
1222
- ; GFX1250-NEXT: v_mov_b32_e32 v12 , v1
1221
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13 ], v2, v4, v[10:11]
1222
+ ; GFX1250-NEXT: v_mov_b32_e32 v10 , v1
1223
1223
; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
1224
1224
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
1225
- ; GFX1250-NEXT: v_mov_b32_e32 v13, v10
1226
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13 ], vcc_lo, v8, v5, v[12:13 ]
1225
+ ; GFX1250-NEXT: v_mov_b32_e32 v11, v12
1226
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15 ], vcc_lo, v8, v5, v[10:11 ]
1227
1227
; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7
1228
1228
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1229
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13 ]
1230
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11 , v8, s0
1229
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15 ]
1230
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13 , v8, s0
1231
1231
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1232
1232
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo
1233
1233
; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1
@@ -2874,86 +2874,87 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
2874
2874
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
2875
2875
; GFX1250-NEXT: s_wait_kmcnt 0x0
2876
2876
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0
2877
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0
2878
- ; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
2879
- ; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
2880
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2881
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
2882
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
2877
+ ; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11
2878
+ ; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10
2879
+ ; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12
2880
+ ; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13
2881
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v1, v13, v[16:17]
2882
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v12, 0
2883
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2884
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19]
2885
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[16:17]
2883
2886
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2884
- ; GFX1250-NEXT: v_cndmask_b32_e64 v20 , 0, 1, s0
2885
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12 , v[16:17 ]
2887
+ ; GFX1250-NEXT: v_cndmask_b32_e64 v22 , 0, 1, s0
2888
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11 , v[20:21 ]
2886
2889
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2887
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
2888
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
2889
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0
2890
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2891
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
2892
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
2890
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
2891
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
2892
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2893
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[16:17]
2894
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], vcc_lo, v3, v9, v[20:21]
2893
2895
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2894
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v24 , null, 0, v22, vcc_lo
2895
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17 ], v4, v10 , v[16:17 ]
2896
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2897
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
2898
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
2899
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2900
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
2901
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23 ], v6, v8, v[16:17 ]
2902
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21 ]
2903
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4 )
2904
- ; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
2905
- ; GFX1250-NEXT: v_mul_lo_u32 v22 , v6, v9
2896
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v26 , null, 0, v22, vcc_lo
2897
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21 ], v5, v9 , v[18:19 ]
2898
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v10, 0
2899
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
2900
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[16:17]
2901
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo
2902
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2903
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25 ], v6, v8, v[20:21 ]
2904
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[18:19 ]
2905
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3 )
2906
+ ; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24
2907
+ ; GFX1250-NEXT: v_mul_lo_u32 v24 , v6, v9
2906
2908
; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
2907
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25 ], s0, v2, v8 , v[16:17 ]
2908
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2909
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
2909
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21 ], vcc_lo, v0, v13 , v[18:19 ]
2910
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[16:17]
2911
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2910
2912
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
2911
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
2912
2913
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
2913
- ; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
2914
- ; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
2915
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2916
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
2917
- ; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
2914
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
2915
+ ; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
2916
+ ; GFX1250-NEXT: v_mov_b32_e32 v13, v18
2917
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
2918
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v0, v11, v[20:21]
2918
2919
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
2920
+ ; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2
2919
2921
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0
2920
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2921
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
2922
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
2923
- ; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
2924
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
2925
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
2926
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
2927
- ; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
2928
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
2929
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
2930
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
2931
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
2922
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
2923
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v1, v10, v[22:23]
2924
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2
2925
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21]
2926
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2927
+ ; GFX1250-NEXT: v_mov_b32_e32 v12, v17
2928
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27]
2929
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
2930
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v0, v9, v[12:13]
2931
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23]
2932
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v33, s2
2932
2933
; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
2933
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
2934
- ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
2935
2934
; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
2936
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
2935
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11]
2936
+ ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
2937
2937
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
2938
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
2938
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19]
2939
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[20:21]
2939
2940
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2940
2941
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
2941
2942
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
2942
2943
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2943
2944
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
2944
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26 , v11, s2
2945
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28 , v11, s2
2945
2946
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2946
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23 , v0, s2
2947
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v25 , v0, s2
2947
2948
; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
2948
2949
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
2949
2950
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2950
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20 , s4
2951
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29 , s3
2951
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v32 , s4
2952
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v31 , s3
2952
2953
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2953
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25 , s1
2954
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27 , s0
2954
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v30 , s1
2955
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29 , s0
2955
2956
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2956
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22 , vcc_lo
2957
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v24 , vcc_lo
2957
2958
; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0
2958
2959
; GFX1250-NEXT: v_mov_b32_e32 v0, v16
2959
2960
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -3018,9 +3019,9 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
3018
3019
;
3019
3020
; GFX1250-LABEL: s_mul_u64_zext_with_vregs:
3020
3021
; GFX1250: ; %bb.0:
3021
- ; GFX1250-NEXT: global_load_b32 v2 , v[2:3], off
3022
+ ; GFX1250-NEXT: global_load_b32 v4 , v[2:3], off
3022
3023
; GFX1250-NEXT: s_wait_loadcnt 0x0
3023
- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2 , 0
3024
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4 , 0
3024
3025
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
3025
3026
; GFX1250-NEXT: s_endpgm
3026
3027
%val = load i32 , ptr addrspace (1 ) %in , align 4
@@ -3212,9 +3213,9 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
3212
3213
;
3213
3214
; GFX1250-LABEL: s_mul_u64_sext_with_vregs:
3214
3215
; GFX1250: ; %bb.0:
3215
- ; GFX1250-NEXT: global_load_b32 v2 , v[2:3], off
3216
+ ; GFX1250-NEXT: global_load_b32 v4 , v[2:3], off
3216
3217
; GFX1250-NEXT: s_wait_loadcnt 0x0
3217
- ; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2 , 0
3218
+ ; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4 , 0
3218
3219
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
3219
3220
; GFX1250-NEXT: s_endpgm
3220
3221
%val = load i32 , ptr addrspace (1 ) %in , align 4
0 commit comments