@@ -809,10 +809,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
809809; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5
810810; GFX1250-NEXT: v_mov_b32_e32 v8, v1
811811; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
812- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5 ], v6, v4, v[8:9]
813- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3 ], v7, v3, v[4:5 ]
812+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11 ], v6, v4, v[8:9]
813+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5 ], v7, v3, v[10:11 ]
814814; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
815- ; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
815+ ; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
816816; GFX1250-NEXT: s_set_pc_i64 s[30:31]
817817 %result = mul i96 %num , %den
818818 ret i96 %result
@@ -1218,16 +1218,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
12181218; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1]
12191219; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0
12201220; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1221- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11 ], v2, v4, v[10:11]
1222- ; GFX1250-NEXT: v_mov_b32_e32 v12 , v1
1221+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13 ], v2, v4, v[10:11]
1222+ ; GFX1250-NEXT: v_mov_b32_e32 v10 , v1
12231223; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
12241224; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
1225- ; GFX1250-NEXT: v_mov_b32_e32 v13, v10
1226- ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13 ], vcc_lo, v8, v5, v[12:13 ]
1225+ ; GFX1250-NEXT: v_mov_b32_e32 v11, v12
1226+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15 ], vcc_lo, v8, v5, v[10:11 ]
12271227; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7
12281228; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1229- ; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13 ]
1230- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11 , v8, s0
1229+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15 ]
1230+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13 , v8, s0
12311231; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12321232; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo
12331233; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1
@@ -2874,87 +2874,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
28742874; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
28752875; GFX1250-NEXT: s_wait_kmcnt 0x0
28762876; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
2877- ; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
2878- ; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
2879- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2877+ ; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11
2878+ ; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10
2879+ ; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12
2880+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
28802881; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0
2881- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0
2882- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1]
2883- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2884- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
2885- ; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
2886- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2887- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1]
2888- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
2889- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
2890- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
2891- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0
2892- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1]
2893- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
2894- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
2895- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
2882+ ; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13
2883+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1]
2884+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0
2885+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2886+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19]
2887+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1]
2888+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2889+ ; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
2890+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21]
2891+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2892+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
2893+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
28962894; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2897- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1 ], v4, v10, v[0:1]
2898- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19 ], vcc_lo, v4, v8 , v[18:19 ]
2895+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19 ], v4, v10, v[0:1]
2896+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1 ], vcc_lo, v3, v9 , v[20:21 ]
28992897; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2900- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
2901- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1]
2902- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2903- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1]
2904- ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
2905- ; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
2906- ; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
2907- ; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
2898+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
2899+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19]
2900+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0
2901+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
2902+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1]
2903+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo
29082904; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2909- ; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1]
2910- ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21]
2911- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2905+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21]
2906+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19]
2907+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2908+ ; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24
2909+ ; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9
2910+ ; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
2911+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19]
2912+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1]
2913+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
29122914; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
29132915; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21]
2914- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2915- ; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
2916- ; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
2917- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21]
2918- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
2919- ; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
2916+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
2917+ ; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
2918+ ; GFX1250-NEXT: v_mov_b32_e32 v13, v18
2919+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
2920+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21]
29202921; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1]
2922+ ; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2
29212923; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0
2922- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19]
2923- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
2924- ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
2925- ; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
2926- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
2927- ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
2928- ; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24
2929- ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
2930- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2931- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2
2932- ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19]
2924+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
2925+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v17, v10, v[22:23]
2926+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2
2927+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21]
2928+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2929+ ; GFX1250-NEXT: v_mov_b32_e32 v12, v1
2930+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27]
2931+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
2932+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13]
2933+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23]
2934+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2
29332935; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15
2934- ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
2935- ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
29362936; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14
2937- ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
2937+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11]
2938+ ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
29382939; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2
2939- ; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19]
2940+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19]
2941+ ; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21]
29402942; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29412943; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
29422944; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
29432945; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29442946; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2
2945- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26 , v11, s2
2947+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28 , v11, s2
29462948; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2947- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23 , v2, s2
2949+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25 , v2, s2
29482950; GFX1250-NEXT: v_mov_b32_e32 v2, v15
29492951; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5
29502952; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2951- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20 , s4
2952- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29 , s3
2953+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32 , s4
2954+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31 , s3
29532955; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2954- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25 , s1
2955- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27 , s0
2956+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30 , s1
2957+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29 , s0
29562958; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2957- ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22 , vcc_lo
2959+ ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24 , vcc_lo
29582960; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1
29592961; GFX1250-NEXT: v_mov_b32_e32 v1, v14
29602962; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -3019,9 +3021,9 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
30193021;
30203022; GFX1250-LABEL: s_mul_u64_zext_with_vregs:
30213023; GFX1250: ; %bb.0:
3022- ; GFX1250-NEXT: global_load_b32 v2 , v[2:3], off
3024+ ; GFX1250-NEXT: global_load_b32 v4 , v[2:3], off
30233025; GFX1250-NEXT: s_wait_loadcnt 0x0
3024- ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2 , 0
3026+ ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4 , 0
30253027; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
30263028; GFX1250-NEXT: s_endpgm
30273029 %val = load i32 , ptr addrspace (1 ) %in , align 4
@@ -3213,9 +3215,9 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
32133215;
32143216; GFX1250-LABEL: s_mul_u64_sext_with_vregs:
32153217; GFX1250: ; %bb.0:
3216- ; GFX1250-NEXT: global_load_b32 v2 , v[2:3], off
3218+ ; GFX1250-NEXT: global_load_b32 v4 , v[2:3], off
32173219; GFX1250-NEXT: s_wait_loadcnt 0x0
3218- ; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2 , 0
3220+ ; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4 , 0
32193221; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
32203222; GFX1250-NEXT: s_endpgm
32213223 %val = load i32 , ptr addrspace (1 ) %in , align 4
0 commit comments