@@ -565,22 +565,23 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
565565;
566566; GFX11-LABEL: srem32_invariant_denom:
567567; GFX11: ; %bb.0: ; %bb
568- ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c
569- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
570- ; GFX11-NEXT: s_abs_i32 s2, s0
568+ ; GFX11-NEXT: s_clause 0x1
569+ ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
571570; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
571+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
572+ ; GFX11-NEXT: s_abs_i32 s2, s2
573+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
572574; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
573575; GFX11-NEXT: s_sub_i32 s3, 0, s2
574- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
575576; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
576577; GFX11-NEXT: s_waitcnt_depctr 0xfff
577578; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
579+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
578580; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
579- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
580581; GFX11-NEXT: v_readfirstlane_b32 s4, v0
581582; GFX11-NEXT: v_mov_b32_e32 v0, 0
583+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
582584; GFX11-NEXT: s_mul_i32 s3, s3, s4
583- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
584585; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3
585586; GFX11-NEXT: s_mov_b32 s3, 0
586587; GFX11-NEXT: s_add_i32 s4, s4, s5
@@ -601,7 +602,6 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
601602; GFX11-NEXT: s_cselect_b32 s5, s6, s5
602603; GFX11-NEXT: s_add_i32 s3, s3, 1
603604; GFX11-NEXT: v_mov_b32_e32 v1, s5
604- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
605605; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
606606; GFX11-NEXT: s_add_u32 s0, s0, 4
607607; GFX11-NEXT: s_addc_u32 s1, s1, 0
@@ -694,31 +694,32 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
694694; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
695695; GFX11-NEXT: s_waitcnt lgkmcnt(0)
696696; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
697- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
697+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
698698; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
699699; GFX11-NEXT: s_mov_b32 s2, 0
700- ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
701700; GFX11-NEXT: .p2align 6
702701; GFX11-NEXT: .LBB4_1: ; %bb3
703702; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
703+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
704+ ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
704705; GFX11-NEXT: s_and_b32 s3, 0xffff, s2
705706; GFX11-NEXT: s_add_i32 s2, s2, 1
706707; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s3
707708; GFX11-NEXT: s_lshl_b32 s3, s3, 1
708- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
709- ; GFX11-NEXT: v_mov_b32_e32 v4, s3
709+ ; GFX11-NEXT: v_mov_b32_e32 v3, s3
710710; GFX11-NEXT: s_and_b32 s3, s2, 0xffff
711- ; GFX11-NEXT: s_waitcnt_depctr 0xfff
712- ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
711+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
713712; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400
714- ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
713+ ; GFX11-NEXT: s_waitcnt_depctr 0xfff
714+ ; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
715+ ; GFX11-NEXT: v_trunc_f32_e32 v1, v1
715716; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
716- ; GFX11-NEXT: v_fma_f32 v2, -v3 , v0, v2
717- ; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
717+ ; GFX11-NEXT: v_fma_f32 v2, -v1 , v0, v2
718+ ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
718719; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
719720; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
720- ; GFX11-NEXT: v_add_co_ci_u32_e32 v2 , vcc_lo, 0, v3 , vcc_lo
721- ; GFX11-NEXT: global_store_b16 v4, v2 , s[0:1]
721+ ; GFX11-NEXT: v_add_co_ci_u32_e32 v1 , vcc_lo, 0, v1 , vcc_lo
722+ ; GFX11-NEXT: global_store_b16 v3, v1 , s[0:1]
722723; GFX11-NEXT: s_cbranch_scc0 .LBB4_1
723724; GFX11-NEXT: ; %bb.2: ; %bb2
724725; GFX11-NEXT: s_endpgm
@@ -812,33 +813,34 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
812813; GFX11-NEXT: s_mov_b32 s3, 0
813814; GFX11-NEXT: s_waitcnt lgkmcnt(0)
814815; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
815- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
816+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
816817; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
817- ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
818818; GFX11-NEXT: .p2align 6
819819; GFX11-NEXT: .LBB5_1: ; %bb3
820820; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
821+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
822+ ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
821823; GFX11-NEXT: s_and_b32 s4, 0xffff, s3
822824; GFX11-NEXT: s_add_i32 s3, s3, 1
823825; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s4
824826; GFX11-NEXT: s_lshl_b32 s5, s4, 1
825827; GFX11-NEXT: s_waitcnt_depctr 0xfff
826- ; GFX11-NEXT: v_mul_f32_e32 v3 , v2, v1
828+ ; GFX11-NEXT: v_mul_f32_e32 v1 , v2, v1
827829; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
828- ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
829- ; GFX11-NEXT: v_fma_f32 v2, -v3 , v0, v2
830- ; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
831- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT ) | instid1(VALU_DEP_2 )
830+ ; GFX11-NEXT: v_trunc_f32_e32 v1, v1
831+ ; GFX11-NEXT: v_fma_f32 v2, -v1 , v0, v2
832+ ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
833+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1 ) | instid1(VALU_DEP_3 )
832834; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
833- ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
834- ; GFX11-NEXT: v_mov_b32_e32 v3, s5
835- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2 ) | instskip(NEXT) | instid1(VALU_DEP_1)
836- ; GFX11-NEXT: v_mul_lo_u32 v2, v2 , s2
837- ; GFX11-NEXT: v_sub_nc_u32_e32 v2 , s4, v2
835+ ; GFX11-NEXT: v_mov_b32_e32 v2, s5
836+ ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
837+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1 ) | instskip(NEXT) | instid1(VALU_DEP_1)
838+ ; GFX11-NEXT: v_mul_lo_u32 v1, v1 , s2
839+ ; GFX11-NEXT: v_sub_nc_u32_e32 v1 , s4, v1
838840; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
839841; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
840842; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
841- ; GFX11-NEXT: global_store_b16 v3, v2 , s[0:1]
843+ ; GFX11-NEXT: global_store_b16 v2, v1 , s[0:1]
842844; GFX11-NEXT: s_cbranch_scc0 .LBB5_1
843845; GFX11-NEXT: ; %bb.2: ; %bb2
844846; GFX11-NEXT: s_endpgm
@@ -940,38 +942,37 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
940942; GFX11-NEXT: s_mov_b32 s3, 0
941943; GFX11-NEXT: s_waitcnt lgkmcnt(0)
942944; GFX11-NEXT: s_sext_i32_i16 s2, s2
943- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
945+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
944946; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
945- ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
946947; GFX11-NEXT: .p2align 6
947948; GFX11-NEXT: .LBB6_1: ; %bb3
948949; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
950+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
951+ ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
949952; GFX11-NEXT: s_sext_i32_i16 s4, s3
950- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
951953; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s4
952954; GFX11-NEXT: s_xor_b32 s4, s4, s2
955+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
953956; GFX11-NEXT: s_ashr_i32 s4, s4, 30
954- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
955957; GFX11-NEXT: s_or_b32 s4, s4, 1
956958; GFX11-NEXT: s_waitcnt_depctr 0xfff
957- ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
958- ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
959+ ; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
959960; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
960- ; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
961+ ; GFX11-NEXT: v_trunc_f32_e32 v1, v1
962+ ; GFX11-NEXT: v_fma_f32 v2, -v1, v0, v2
963+ ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
964+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
961965; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v2|, |v0|
962- ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
963- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
964966; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
965967; GFX11-NEXT: s_cselect_b32 s4, s4, 0
966968; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
967- ; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2
968- ; GFX11-NEXT: s_lshl_b32 s5, s5, 1
969969; GFX11-NEXT: s_add_i32 s3, s3, 1
970- ; GFX11-NEXT: v_mov_b32_e32 v3, s5
970+ ; GFX11-NEXT: s_lshl_b32 s5, s5, 1
971+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
972+ ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_add_nc_u32 v1, s4, v1
971973; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
972- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
973974; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
974- ; GFX11-NEXT: global_store_b16 v3, v2 , s[0:1]
975+ ; GFX11-NEXT: global_store_b16 v2, v1 , s[0:1]
975976; GFX11-NEXT: s_cbranch_scc0 .LBB6_1
976977; GFX11-NEXT: ; %bb.2: ; %bb2
977978; GFX11-NEXT: s_endpgm
@@ -1077,42 +1078,42 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
10771078; GFX11-NEXT: s_mov_b32 s3, 0
10781079; GFX11-NEXT: s_waitcnt lgkmcnt(0)
10791080; GFX11-NEXT: s_sext_i32_i16 s2, s2
1080- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1081+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
10811082; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
1082- ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
10831083; GFX11-NEXT: .p2align 6
10841084; GFX11-NEXT: .LBB7_1: ; %bb3
10851085; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1086+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1087+ ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
10861088; GFX11-NEXT: s_sext_i32_i16 s4, s3
1087- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
10881089; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s4
10891090; GFX11-NEXT: s_xor_b32 s5, s4, s2
1091+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
10901092; GFX11-NEXT: s_ashr_i32 s5, s5, 30
1091- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
10921093; GFX11-NEXT: s_or_b32 s5, s5, 1
10931094; GFX11-NEXT: s_waitcnt_depctr 0xfff
1094- ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
1095- ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
1095+ ; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
10961096; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1097- ; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
1097+ ; GFX11-NEXT: v_trunc_f32_e32 v1, v1
1098+ ; GFX11-NEXT: v_fma_f32 v2, -v1, v0, v2
1099+ ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
1100+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
10981101; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v2|, |v0|
1099- ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
1100- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
11011102; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
11021103; GFX11-NEXT: s_cselect_b32 s5, s5, 0
1103- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1 ) | instid1(SALU_CYCLE_1)
1104- ; GFX11-NEXT: v_add_nc_u32_e32 v2 , s5, v2
1104+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2 ) | instid1(SALU_CYCLE_1)
1105+ ; GFX11-NEXT: v_add_nc_u32_e32 v1 , s5, v1
11051106; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
11061107; GFX11-NEXT: s_add_i32 s3, s3, 1
11071108; GFX11-NEXT: s_lshl_b32 s5, s5, 1
1108- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1 ) | instskip(SKIP_1) | instid1(VALU_DEP_2 )
1109- ; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
1110- ; GFX11-NEXT: v_mov_b32_e32 v3, s5
1111- ; GFX11-NEXT: v_sub_nc_u32_e32 v2 , s4, v2
1109+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1 ) | instskip(SKIP_1) | instid1(VALU_DEP_1 )
1110+ ; GFX11-NEXT: v_mov_b32_e32 v2, s5
1111+ ; GFX11-NEXT: v_mul_lo_u32 v1, v1, s2
1112+ ; GFX11-NEXT: v_sub_nc_u32_e32 v1 , s4, v1
11121113; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
11131114; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
11141115; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
1115- ; GFX11-NEXT: global_store_b16 v3, v2 , s[0:1]
1116+ ; GFX11-NEXT: global_store_b16 v2, v1 , s[0:1]
11161117; GFX11-NEXT: s_cbranch_scc0 .LBB7_1
11171118; GFX11-NEXT: ; %bb.2: ; %bb2
11181119; GFX11-NEXT: s_endpgm
0 commit comments