@@ -801,15 +801,15 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
801
801
; GFX1250-NEXT: s_wait_kmcnt 0x0
802
802
; GFX1250-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
803
803
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
804
- ; GFX1250-NEXT: v_mul_lo_u32 v0, v6, v5
805
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1]
806
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null , v6, v3, 0
807
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1 )
808
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
809
- ; GFX1250-NEXT: v_dual_mov_b32 v10 , v1 :: v_dual_mov_b32 v11, v8
804
+ ; GFX1250-NEXT: v_mul_lo_u32 v0, v7, v4
805
+ ; GFX1250-NEXT: v_mad_u32 v5, v6, v5, v0
806
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v6, v3, 0
807
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2 )
808
+ ; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5
809
+ ; GFX1250-NEXT: v_mov_b32_e32 v8 , v1
810
810
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
811
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v6, v4, v[10:11 ]
812
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null , v7, v3, v[4:5]
811
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9 ]
812
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5]
813
813
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
814
814
; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
815
815
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -1206,11 +1206,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
1206
1206
; GFX1250-NEXT: s_wait_kmcnt 0x0
1207
1207
; GFX1250-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
1208
1208
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1209
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null , v8, v6, 0
1210
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null , v9, v5, v[0:1]
1211
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null , v8, v4, 0
1209
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v6, 0
1210
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1]
1211
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0
1212
1212
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1213
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null , v2, v4, v[10:11]
1213
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11]
1214
1214
; GFX1250-NEXT: v_mov_b32_e32 v12, v1
1215
1215
; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
1216
1216
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1220,15 +1220,13 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
1220
1220
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1221
1221
; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
1222
1222
; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0
1223
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
1224
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v8, v1, vcc_lo
1225
- ; GFX1250-NEXT: v_mov_b32_e32 v1, v6
1226
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1227
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v5, v[8:9]
1223
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1224
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo
1225
+ ; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1
1226
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
1228
1227
; GFX1250-NEXT: v_mov_b32_e32 v2, v7
1229
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v3, v4, v[8:9]
1230
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
1231
- ; GFX1250-NEXT: v_mov_b32_e32 v3, v4
1228
+ ; GFX1250-NEXT: v_mad_u32 v3, v3, v4, v1
1229
+ ; GFX1250-NEXT: v_mov_b32_e32 v1, v6
1232
1230
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
1233
1231
%result = mul i128 %num , %den
1234
1232
ret i128 %result
@@ -2856,90 +2854,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
2856
2854
; GFX1250: ; %bb.0:
2857
2855
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
2858
2856
; GFX1250-NEXT: s_wait_kmcnt 0x0
2859
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null , v0, v14, 0
2860
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], null , v0, v12, 0
2861
- ; GFX1250-NEXT: v_mul_lo_u32 v26, v6, v9
2857
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0
2858
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0
2859
+ ; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
2862
2860
; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
2863
2861
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2864
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null , v1, v13, v[16:17]
2862
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
2865
2863
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
2866
2864
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2867
2865
; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
2868
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null , v2, v12, v[16:17]
2866
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17]
2869
2867
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2870
2868
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
2871
2869
; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
2872
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], null , v0, v10, 0
2870
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0
2873
2871
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2874
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null , v3, v11, v[16:17]
2872
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
2875
2873
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
2876
2874
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2877
2875
; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
2878
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null , v4, v10, v[16:17]
2879
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2 )
2876
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17]
2877
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1 )
2880
2878
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
2881
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v5, v9, v[16:17]
2882
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
2883
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[16:17]
2879
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
2880
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2881
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
2882
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17]
2884
2883
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
2885
- ; GFX1250-NEXT: v_mov_b32_e32 v20, v19
2886
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v24, vcc_lo
2887
- ; GFX1250-NEXT: v_cndmask_b32_e64 v19, 0, 1, s0
2888
- ; GFX1250-NEXT: v_mov_b32_e32 v21, v22
2889
- ; GFX1250-NEXT: v_mul_lo_u32 v22, v5, v10
2890
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v2, v8, v[16:17]
2891
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
2892
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v27 , null, 0, v19, vcc_lo
2893
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v0, v13, v[20:21]
2894
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2895
- ; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v25
2884
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
2885
+ ; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
2886
+ ; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
2887
+ ; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
2888
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17]
2889
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2890
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
2891
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6 , null, 0, v6, s0
2892
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
2893
+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
2894
+ ; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
2896
2895
; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
2897
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], vcc_lo, v1, v12, v[16:17]
2898
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2896
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2899
2897
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
2900
2898
; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
2901
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2902
2899
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
2900
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0
2901
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2903
2902
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
2904
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v8, 0
2905
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
2906
2903
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
2907
2904
; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
2908
2905
; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
2906
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
2909
2907
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
2910
2908
; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
2911
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
2912
2909
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
2910
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
2913
2911
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
2914
2912
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
2915
2913
; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
2916
2914
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
2917
2915
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
2918
2916
; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
2919
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
2920
2917
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
2918
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
2921
2919
; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
2922
2920
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2923
2921
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
2924
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v27 , v13, s2
2922
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6 , v13, s2
2925
2923
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2926
2924
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
2927
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v6 , v11, s2
2928
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT ) | instid1(VALU_DEP_1 )
2925
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26 , v11, s2
2926
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1 ) | instid1(VALU_DEP_2 )
2929
2927
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
2928
+ ; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
2930
2929
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
2931
2930
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2932
2931
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
2933
2932
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
2934
2933
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2935
2934
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
2936
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
2935
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0
2937
2936
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2938
- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v26, s0
2939
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v8, v[0:1]
2940
- ; GFX1250-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
2941
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
2942
- ; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v7, v8
2937
+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
2938
+ ; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0
2939
+ ; GFX1250-NEXT: v_mov_b32_e32 v0, v16
2943
2940
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
2944
2941
%result = mul i256 %num , %den
2945
2942
ret i256 %result
@@ -3004,7 +3001,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
3004
3001
; GFX1250: ; %bb.0:
3005
3002
; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
3006
3003
; GFX1250-NEXT: s_wait_loadcnt 0x0
3007
- ; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null , 0x50, v2, 0
3004
+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0
3008
3005
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
3009
3006
; GFX1250-NEXT: s_endpgm
3010
3007
%val = load i32 , ptr addrspace (1 ) %in , align 4
@@ -3195,7 +3192,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
3195
3192
; GFX1250: ; %bb.0:
3196
3193
; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
3197
3194
; GFX1250-NEXT: s_wait_loadcnt 0x0
3198
- ; GFX1250-NEXT: v_mad_co_i64_i32 v[2:3], null , 0x50, v2, 0
3195
+ ; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0
3199
3196
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
3200
3197
; GFX1250-NEXT: s_endpgm
3201
3198
%val = load i32 , ptr addrspace (1 ) %in , align 4
0 commit comments