@@ -801,15 +801,15 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
801801; GFX1250-NEXT: s_wait_kmcnt 0x0
802802; GFX1250-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
803803; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
804- ; GFX1250-NEXT: v_mul_lo_u32 v0, v6, v5
805- ; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1]
806- ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null , v6, v3, 0
807- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1 )
808- ; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
809- ; GFX1250-NEXT: v_dual_mov_b32 v10 , v1 :: v_dual_mov_b32 v11, v8
804+ ; GFX1250-NEXT: v_mul_lo_u32 v0, v7, v4
805+ ; GFX1250-NEXT: v_mad_u32 v5, v6, v5, v0
806+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v6, v3, 0
807+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2 )
808+ ; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5
809+ ; GFX1250-NEXT: v_mov_b32_e32 v8 , v1
810810; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
811- ; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v6, v4, v[10:11 ]
812- ; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null , v7, v3, v[4:5]
811+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9 ]
812+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5]
813813; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
814814; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
815815; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -1206,11 +1206,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
12061206; GFX1250-NEXT: s_wait_kmcnt 0x0
12071207; GFX1250-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
12081208; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1209- ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null , v8, v6, 0
1210- ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null , v9, v5, v[0:1]
1211- ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null , v8, v4, 0
1209+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v6, 0
1210+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1]
1211+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0
12121212; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1213- ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null , v2, v4, v[10:11]
1213+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11]
12141214; GFX1250-NEXT: v_mov_b32_e32 v12, v1
12151215; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
12161216; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1220,15 +1220,13 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
12201220; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
12211221; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
12221222; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0
1223- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
1224- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v8, v1, vcc_lo
1225- ; GFX1250-NEXT: v_mov_b32_e32 v1, v6
1226- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1227- ; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v5, v[8:9]
1223+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1224+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo
1225+ ; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1
1226+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
12281227; GFX1250-NEXT: v_mov_b32_e32 v2, v7
1229- ; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v3, v4, v[8:9]
1230- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
1231- ; GFX1250-NEXT: v_mov_b32_e32 v3, v4
1228+ ; GFX1250-NEXT: v_mad_u32 v3, v3, v4, v1
1229+ ; GFX1250-NEXT: v_mov_b32_e32 v1, v6
12321230; GFX1250-NEXT: s_set_pc_i64 s[30:31]
12331231 %result = mul i128 %num , %den
12341232 ret i128 %result
@@ -2856,90 +2854,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
28562854; GFX1250: ; %bb.0:
28572855; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
28582856; GFX1250-NEXT: s_wait_kmcnt 0x0
2859- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null , v0, v14, 0
2860- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], null , v0, v12, 0
2861- ; GFX1250-NEXT: v_mul_lo_u32 v26, v6, v9
2857+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0
2858+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0
2859+ ; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
28622860; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
28632861; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2864- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null , v1, v13, v[16:17]
2862+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
28652863; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
28662864; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
28672865; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
2868- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null , v2, v12, v[16:17]
2866+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17]
28692867; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
28702868; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
28712869; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
2872- ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], null , v0, v10, 0
2870+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0
28732871; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2874- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null , v3, v11, v[16:17]
2872+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
28752873; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
28762874; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
28772875; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
2878- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null , v4, v10, v[16:17]
2879- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2 )
2876+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17]
2877+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1 )
28802878; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
2881- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v5, v9, v[16:17]
2882- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
2883- ; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[16:17]
2879+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
2880+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2881+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
2882+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17]
28842883; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
2885- ; GFX1250-NEXT: v_mov_b32_e32 v20, v19
2886- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v24, vcc_lo
2887- ; GFX1250-NEXT: v_cndmask_b32_e64 v19, 0, 1, s0
2888- ; GFX1250-NEXT: v_mov_b32_e32 v21, v22
2889- ; GFX1250-NEXT: v_mul_lo_u32 v22, v5, v10
2890- ; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v2, v8, v[16:17]
2891- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
2892- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v27 , null, 0, v19, vcc_lo
2893- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v0, v13, v[20:21]
2894- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2895- ; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v25
2884+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
2885+ ; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
2886+ ; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
2887+ ; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
2888+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17]
2889+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2890+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
2891+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6 , null, 0, v6, s0
2892+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
2893+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
2894+ ; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
28962895; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
2897- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], vcc_lo, v1, v12, v[16:17]
2898- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2896+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
28992897; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
29002898; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
2901- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29022899; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
2900+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0
2901+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
29032902; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
2904- ; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v8, 0
2905- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
29062903; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
29072904; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
29082905; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
2906+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
29092907; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
29102908; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
2911- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
29122909; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
2910+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
29132911; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
29142912; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
29152913; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
29162914; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
29172915; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
29182916; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
2919- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
29202917; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
2918+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
29212919; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
29222920; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29232921; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
2924- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v27 , v13, s2
2922+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6 , v13, s2
29252923; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29262924; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
2927- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v6 , v11, s2
2928- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT ) | instid1(VALU_DEP_1 )
2925+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26 , v11, s2
2926+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1 ) | instid1(VALU_DEP_2 )
29292927; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
2928+ ; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
29302929; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
29312930; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29322931; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
29332932; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
29342933; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29352934; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
2936- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
2935+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0
29372936; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2938- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v26, s0
2939- ; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v8, v[0:1]
2940- ; GFX1250-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
2941- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
2942- ; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v7, v8
2937+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
2938+ ; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0
2939+ ; GFX1250-NEXT: v_mov_b32_e32 v0, v16
29432940; GFX1250-NEXT: s_set_pc_i64 s[30:31]
29442941 %result = mul i256 %num , %den
29452942 ret i256 %result
@@ -3004,7 +3001,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
30043001; GFX1250: ; %bb.0:
30053002; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
30063003; GFX1250-NEXT: s_wait_loadcnt 0x0
3007- ; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null , 0x50, v2, 0
3004+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0
30083005; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
30093006; GFX1250-NEXT: s_endpgm
30103007 %val = load i32 , ptr addrspace (1 ) %in , align 4
@@ -3195,7 +3192,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
31953192; GFX1250: ; %bb.0:
31963193; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
31973194; GFX1250-NEXT: s_wait_loadcnt 0x0
3198- ; GFX1250-NEXT: v_mad_co_i64_i32 v[2:3], null , 0x50, v2, 0
3195+ ; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0
31993196; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
32003197; GFX1250-NEXT: s_endpgm
32013198 %val = load i32 , ptr addrspace (1 ) %in , align 4
0 commit comments