@@ -26,16 +26,16 @@ define i32 @v_underflow_compare_fold_i32(i32 %a, i32 %b) #0 {
2626; GFX9-LABEL: v_underflow_compare_fold_i32:
2727; GFX9: ; %bb.0:
2828; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29- ; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
30- ; GFX9-NEXT: v_min_u32_e32 v0, v1, v0
29+ ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc , v0, v1
30+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
3131; GFX9-NEXT: s_setpc_b64 s[30:31]
3232;
3333; GFX11-LABEL: v_underflow_compare_fold_i32:
3434; GFX11: ; %bb.0:
3535; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36- ; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1
36+ ; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo , v0, v1
3737; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
38- ; GFX11-NEXT: v_min_u32_e32 v0, v1, v0
38+ ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
3939; GFX11-NEXT: s_setpc_b64 s[30:31]
4040 %sub = sub i32 %a , %b
4141 %cond = call i32 @llvm.umin.i32 (i32 %sub , i32 %a )
@@ -46,16 +46,16 @@ define i32 @v_underflow_compare_fold_i32_commute(i32 %a, i32 %b) #0 {
4646; GFX9-LABEL: v_underflow_compare_fold_i32_commute:
4747; GFX9: ; %bb.0:
4848; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49- ; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
50- ; GFX9-NEXT: v_min_u32_e32 v0, v0, v1
49+ ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc , v0, v1
50+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
5151; GFX9-NEXT: s_setpc_b64 s[30:31]
5252;
5353; GFX11-LABEL: v_underflow_compare_fold_i32_commute:
5454; GFX11: ; %bb.0:
5555; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56- ; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1
56+ ; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo , v0, v1
5757; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
58- ; GFX11-NEXT: v_min_u32_e32 v0, v0, v1
58+ ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5959; GFX11-NEXT: s_setpc_b64 s[30:31]
6060 %sub = sub i32 %a , %b
6161 %cond = call i32 @llvm.umin.i32 (i32 %a , i32 %sub )
@@ -66,19 +66,20 @@ define i32 @v_underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace
6666; GFX9-LABEL: v_underflow_compare_fold_i32_multi_use:
6767; GFX9: ; %bb.0:
6868; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69- ; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
70- ; GFX9-NEXT: v_min_u32_e32 v0, v1, v0
71- ; GFX9-NEXT: global_store_dword v[2:3], v1, off
69+ ; GFX9-NEXT: v_sub_u32_e32 v4, v0, v1
70+ ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v0, v1
71+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
72+ ; GFX9-NEXT: global_store_dword v[2:3], v4, off
7273; GFX9-NEXT: s_waitcnt vmcnt(0)
7374; GFX9-NEXT: s_setpc_b64 s[30:31]
7475;
7576; GFX11-LABEL: v_underflow_compare_fold_i32_multi_use:
7677; GFX11: ; %bb.0:
7778; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78- ; GFX11-NEXT: v_sub_nc_u32_e32 v1 , v0, v1
79- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
80- ; GFX11-NEXT: v_min_u32_e32 v0, v1, v0
81- ; GFX11-NEXT: global_store_b32 v[2:3] , v1, off
79+ ; GFX11-NEXT: v_sub_nc_u32_e32 v4 , v0, v1
80+ ; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, v0, v1
81+ ; GFX11-NEXT: global_store_b32 v[2:3], v4, off
82+ ; GFX11-NEXT: v_cndmask_b32_e32 v0 , v1, v0, vcc_lo
8283; GFX11-NEXT: s_setpc_b64 s[30:31]
8384 %sub = sub i32 %a , %b
8485 store i32 %sub , ptr addrspace (1 ) %ptr
@@ -190,15 +191,19 @@ define amdgpu_ps i16 @s_underflow_compare_fold_i16(i16 inreg %a, i16 inreg %b) #
190191define amdgpu_ps i32 @s_underflow_compare_fold_i32 (i32 inreg %a , i32 inreg %b ) #0 {
191192; GFX9-LABEL: s_underflow_compare_fold_i32:
192193; GFX9: ; %bb.0:
193- ; GFX9-NEXT: s_sub_i32 s1, s0, s1
194- ; GFX9-NEXT: s_min_u32 s0, s1, s0
194+ ; GFX9-NEXT: v_mov_b32_e32 v0, s1
195+ ; GFX9-NEXT: v_mov_b32_e32 v1, s0
196+ ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
197+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
198+ ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
195199; GFX9-NEXT: ; return to shader part epilog
196200;
197201; GFX11-LABEL: s_underflow_compare_fold_i32:
198202; GFX11: ; %bb.0:
199- ; GFX11-NEXT: s_sub_i32 s1, s0, s1
200- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
201- ; GFX11-NEXT: s_min_u32 s0, s1, s0
203+ ; GFX11-NEXT: v_sub_co_u32 v0, s1, s0, s1
204+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
205+ ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
206+ ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
202207; GFX11-NEXT: ; return to shader part epilog
203208 %sub = sub i32 %a , %b
204209 %cond = call i32 @llvm.umin.i32 (i32 %sub , i32 %a )
0 commit comments