@@ -822,10 +822,9 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
822822; GFX1250-NEXT: v_mov_b32_e32 v2, 0
823823; GFX1250-NEXT: s_wait_kmcnt 0x0
824824; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
825- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2 ) | instid1(VALU_DEP_2)
825+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1 ) | instid1(VALU_DEP_2)
826826; GFX1250-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
827827; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
828- ; GFX1250-NEXT: s_wait_alu 0xf1ff
829828; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
830829; GFX1250-NEXT: s_clause 0x1
831830; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -1803,10 +1802,9 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18031802; GFX1250-NEXT: v_mov_b32_e32 v2, 0
18041803; GFX1250-NEXT: s_wait_kmcnt 0x0
18051804; GFX1250-NEXT: s_sub_nc_u64 s[6:7], s[4:5], s[6:7]
1806- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2 ) | instid1(VALU_DEP_2)
1805+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1 ) | instid1(VALU_DEP_2)
18071806; GFX1250-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
18081807; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
1809- ; GFX1250-NEXT: s_wait_alu 0xf1ff
18101808; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
18111809; GFX1250-NEXT: s_clause 0x1
18121810; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3136,126 +3134,105 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
31363134; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3)
31373135; GFX1250-NEXT: s_fmac_f32 s0, s1, 0x4f800000
31383136; GFX1250-NEXT: v_s_rcp_f32 s0, s0
3139- ; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1 ) | instid1(SALU_CYCLE_2 )
3137+ ; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT ) | instid1(SALU_CYCLE_3 )
31403138; GFX1250-NEXT: s_mul_f32 s0, s0, 0x5f7ffffc
3141- ; GFX1250-NEXT: s_wait_alu 0xfffe
31423139; GFX1250-NEXT: s_mul_f32 s1, s0, 0x2f800000
3143- ; GFX1250-NEXT: s_wait_alu 0xfffe
3144- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
3140+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
31453141; GFX1250-NEXT: s_trunc_f32 s1, s1
3146- ; GFX1250-NEXT: s_wait_alu 0xfffe
31473142; GFX1250-NEXT: s_fmac_f32 s0, s1, 0xcf800000
31483143; GFX1250-NEXT: s_cvt_u32_f32 s5, s1
31493144; GFX1250-NEXT: s_mov_b32 s1, 0
3150- ; GFX1250-NEXT: s_wait_alu 0xfffe
3145+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
31513146; GFX1250-NEXT: s_cvt_u32_f32 s4, s0
3152- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
31533147; GFX1250-NEXT: s_mul_u64 s[12:13], s[6:7], s[4:5]
3148+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
31543149; GFX1250-NEXT: s_mul_hi_u32 s15, s4, s13
31553150; GFX1250-NEXT: s_mul_i32 s14, s4, s13
31563151; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s12
31573152; GFX1250-NEXT: s_mul_i32 s17, s5, s12
3158- ; GFX1250-NEXT: s_wait_alu 0xfffe
31593153; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[0:1], s[14:15]
31603154; GFX1250-NEXT: s_mul_hi_u32 s16, s5, s12
31613155; GFX1250-NEXT: s_mul_hi_u32 s18, s5, s13
31623156; GFX1250-NEXT: s_add_co_u32 s0, s14, s17
31633157; GFX1250-NEXT: s_add_co_ci_u32 s0, s15, s16
31643158; GFX1250-NEXT: s_mul_i32 s12, s5, s13
31653159; GFX1250-NEXT: s_add_co_ci_u32 s13, s18, 0
3166- ; GFX1250-NEXT: s_wait_alu 0xfffe
3160+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
31673161; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13]
3168- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
31693162; GFX1250-NEXT: v_add_co_u32 v0, s0, s4, s12
31703163; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
31713164; GFX1250-NEXT: s_add_co_ci_u32 s5, s5, s13
31723165; GFX1250-NEXT: v_readfirstlane_b32 s4, v0
3173- ; GFX1250-NEXT: s_wait_alu 0xfffe
31743166; GFX1250-NEXT: s_mul_u64 s[6:7], s[6:7], s[4:5]
31753167; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
31763168; GFX1250-NEXT: s_mul_hi_u32 s13, s4, s7
31773169; GFX1250-NEXT: s_mul_i32 s12, s4, s7
31783170; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s6
31793171; GFX1250-NEXT: s_mul_i32 s15, s5, s6
3180- ; GFX1250-NEXT: s_wait_alu 0xfffe
31813172; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13]
31823173; GFX1250-NEXT: s_mul_hi_u32 s14, s5, s6
31833174; GFX1250-NEXT: s_mul_hi_u32 s4, s5, s7
3184- ; GFX1250-NEXT: s_wait_alu 0xfffe
31853175; GFX1250-NEXT: s_add_co_u32 s0, s12, s15
31863176; GFX1250-NEXT: s_add_co_ci_u32 s0, s13, s14
31873177; GFX1250-NEXT: s_mul_i32 s6, s5, s7
31883178; GFX1250-NEXT: s_add_co_ci_u32 s7, s4, 0
3189- ; GFX1250-NEXT: s_wait_alu 0xfffe
3179+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
31903180; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[0:1], s[6:7]
3191- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
31923181; GFX1250-NEXT: v_add_co_u32 v0, s0, v0, s6
31933182; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
31943183; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s7
31953184; GFX1250-NEXT: v_readfirstlane_b32 s7, v0
3196- ; GFX1250-NEXT: s_wait_alu 0xfffe
31973185; GFX1250-NEXT: s_mul_hi_u32 s5, s10, s0
31983186; GFX1250-NEXT: s_mul_i32 s4, s10, s0
31993187; GFX1250-NEXT: s_mul_hi_u32 s12, s11, s0
32003188; GFX1250-NEXT: s_mul_i32 s6, s11, s0
32013189; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s7
32023190; GFX1250-NEXT: s_mul_i32 s13, s11, s7
3203- ; GFX1250-NEXT: s_wait_alu 0xfffe
32043191; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[4:5]
32053192; GFX1250-NEXT: s_mul_hi_u32 s0, s11, s7
3206- ; GFX1250-NEXT: s_wait_alu 0xfffe
32073193; GFX1250-NEXT: s_add_co_u32 s4, s4, s13
32083194; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s0
32093195; GFX1250-NEXT: s_add_co_ci_u32 s7, s12, 0
3210- ; GFX1250-NEXT: s_wait_alu 0xfffe
3196+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
32113197; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[6:7]
3212- ; GFX1250-NEXT: s_wait_alu 0xfffe
32133198; GFX1250-NEXT: s_and_b64 s[6:7], s[4:5], lit64(0xffffffff00000000)
3214- ; GFX1250-NEXT: s_wait_alu 0xfffe
3199+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
32153200; GFX1250-NEXT: s_or_b32 s6, s6, s4
3216- ; GFX1250-NEXT: s_wait_alu 0xfffe
32173201; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[6:7]
32183202; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[6:7], 2
3219- ; GFX1250-NEXT: s_wait_alu 0xfffe
32203203; GFX1250-NEXT: v_sub_co_u32 v0, s0, s10, s4
32213204; GFX1250-NEXT: s_sub_co_i32 s4, s11, s5
32223205; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
32233206; GFX1250-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
32243207; GFX1250-NEXT: v_sub_co_u32 v1, s12, v0, s2
3225- ; GFX1250-NEXT: s_wait_alu 0xfffe
32263208; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, s3
32273209; GFX1250-NEXT: s_cmp_lg_u32 s12, 0
32283210; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], 1
32293211; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1
3230- ; GFX1250-NEXT: s_wait_alu 0xfffe
32313212; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, 0
3232- ; GFX1250-NEXT: s_wait_alu 0xfffe
3213+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
32333214; GFX1250-NEXT: s_cmp_ge_u32 s4, s3
32343215; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
32353216; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
32363217; GFX1250-NEXT: s_cmp_eq_u32 s4, s3
32373218; GFX1250-NEXT: s_cselect_b32 vcc_lo, -1, 0
32383219; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
3239- ; GFX1250-NEXT: s_wait_alu 0xfffe
32403220; GFX1250-NEXT: v_cndmask_b32_e32 v1, s14, v1, vcc_lo
32413221; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0
32423222; GFX1250-NEXT: s_sub_co_ci_u32 s0, s11, s5
3243- ; GFX1250-NEXT: s_wait_alu 0xfffe
3223+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
32443224; GFX1250-NEXT: s_cmp_ge_u32 s0, s3
3245- ; GFX1250-NEXT: s_wait_alu 0xfffd
32463225; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
32473226; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
32483227; GFX1250-NEXT: s_cmp_eq_u32 s0, s3
32493228; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
32503229; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
3251- ; GFX1250-NEXT: s_wait_alu 0xfffe
3230+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
32523231; GFX1250-NEXT: v_cndmask_b32_e64 v0, s4, v0, s0
3253- ; GFX1250-NEXT: s_wait_alu 0xfffd
32543232; GFX1250-NEXT: v_cndmask_b32_e32 v2, s12, v2, vcc_lo
32553233; GFX1250-NEXT: v_cndmask_b32_e32 v1, s13, v3, vcc_lo
3256- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
32573234; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
3258- ; GFX1250-NEXT: s_wait_alu 0xfffd
3235+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
32593236; GFX1250-NEXT: v_cndmask_b32_e32 v1, s7, v1, vcc_lo
32603237; GFX1250-NEXT: v_cndmask_b32_e32 v0, s6, v2, vcc_lo
32613238; GFX1250-NEXT: s_cbranch_execnz .LBB16_3
@@ -3269,31 +3246,25 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
32693246; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
32703247; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0
32713248; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
3272- ; GFX1250-NEXT: s_wait_alu 0xfffe
32733249; GFX1250-NEXT: s_mul_i32 s1, s1, s0
3274- ; GFX1250-NEXT: s_wait_alu 0xfffe
3250+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
32753251; GFX1250-NEXT: s_mul_hi_u32 s1, s0, s1
3276- ; GFX1250-NEXT: s_wait_alu 0xfffe
32773252; GFX1250-NEXT: s_add_co_i32 s0, s0, s1
3278- ; GFX1250-NEXT: s_wait_alu 0xfffe
3253+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
32793254; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s0
3280- ; GFX1250-NEXT: s_wait_alu 0xfffe
32813255; GFX1250-NEXT: s_mul_i32 s1, s0, s2
32823256; GFX1250-NEXT: s_add_co_i32 s3, s0, 1
3283- ; GFX1250-NEXT: s_wait_alu 0xfffe
32843257; GFX1250-NEXT: s_sub_co_i32 s1, s10, s1
3285- ; GFX1250-NEXT: s_wait_alu 0xfffe
3258+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
32863259; GFX1250-NEXT: s_sub_co_i32 s4, s1, s2
32873260; GFX1250-NEXT: s_cmp_ge_u32 s1, s2
32883261; GFX1250-NEXT: s_cselect_b32 s0, s3, s0
3289- ; GFX1250-NEXT: s_wait_alu 0xfffe
32903262; GFX1250-NEXT: s_cselect_b32 s1, s4, s1
32913263; GFX1250-NEXT: s_add_co_i32 s3, s0, 1
3292- ; GFX1250-NEXT: s_wait_alu 0xfffe
32933264; GFX1250-NEXT: s_cmp_ge_u32 s1, s2
32943265; GFX1250-NEXT: s_mov_b32 s1, 0
32953266; GFX1250-NEXT: s_cselect_b32 s0, s3, s0
3296- ; GFX1250-NEXT: s_wait_alu 0xfffe
3267+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
32973268; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
32983269; GFX1250-NEXT: .LBB16_3:
32993270; GFX1250-NEXT: v_mov_b32_e32 v2, 0
0 commit comments