diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1b559a628be08..8ed4062e43946 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32, - Legal); + setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, + MVT::i32, Legal); setOperationAction( {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll index 7633ba0eb4f9c..66cc7f3db03c2 100644 --- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll @@ -15,7 +15,7 @@ define i16 @abs_i16(i16 %arg) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v0, v0, v1 +; GFX6-NEXT: v_max_i32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: abs_i16: @@ -23,7 +23,7 @@ define i16 @abs_i16(i16 %arg) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v0 -; GFX7-NEXT: v_max_i32_e32 v0, v0, v1 +; GFX7-NEXT: v_max_i32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_i16: @@ -97,9 +97,9 @@ define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) { ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 +; GFX6-NEXT: v_max_i32_e32 v0, v2, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 -; GFX6-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -110,9 +110,9 @@ define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) { ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 -; GFX7-NEXT: v_max_i32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_i32_e32 v0, v2, v0 ; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 -; GFX7-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX7-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -172,15 +172,15 @@ define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) { ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v0, v3, v0 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 +; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v2, v1 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v3i16: @@ -189,15 +189,15 @@ define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) { ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX7-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX7-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v0, v3, v0 +; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 +; GFX7-NEXT: v_max_i32_e32 v1, v3, v1 +; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 -; GFX7-NEXT: v_max_i32_e32 v2, v2, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v3i16: @@ -262,47 +262,45 @@ define <4 x i16> @v_abs_v4i16(<4 x i16> %arg) { ; GFX6-LABEL: v_abs_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v0, v4, v0 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v2, v4 +; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 +; GFX6-NEXT: v_max_i32_e32 v3, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v4i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v0, v4, v0 +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 ; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v1, v4, v1 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 -; GFX7-NEXT: v_max_i32_e32 v2, v2, v4 +; GFX7-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GFX7-NEXT: v_max_i32_e32 v3, v3, v4 -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX7-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX7-NEXT: v_max_i32_e32 v1, v1, v3 +; GFX7-NEXT: v_max_i32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v4i16: @@ -370,63 +368,61 @@ define <6 x i16> @v_abs_v6i16(<6 x i16> %arg) { ; GFX6-LABEL: v_abs_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v0 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v0, v6, v0 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v1 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v1, v6, v1 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 +; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v5 +; GFX6-NEXT: v_max_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v2, v6 +; GFX6-NEXT: v_max_i32_e32 v2, v6, v2 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 -; GFX6-NEXT: v_max_i32_e32 v3, v3, v6 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 -; GFX6-NEXT: v_max_i32_e32 v5, v5, v3 -; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v6i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v0 +; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v0, v6, v0 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v1 +; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v1, v6, v1 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 +; GFX7-NEXT: v_max_i32_e32 v4, v6, v4 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v5 +; GFX7-NEXT: v_max_i32_e32 v5, v6, v5 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 -; GFX7-NEXT: v_max_i32_e32 v2, v2, v6 +; GFX7-NEXT: v_max_i32_e32 v2, v6, v2 ; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 -; GFX7-NEXT: v_max_i32_e32 v3, v3, v6 -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX7-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v1, v1, v3 -; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v3, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 -; GFX7-NEXT: v_max_i32_e32 v5, v5, v3 -; GFX7-NEXT: v_max_i32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v6i16: @@ -509,83 +505,79 @@ define <8 x i16> @v_abs_v8i16(<8 x i16> %arg) { ; GFX6-LABEL: v_abs_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v0 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v0, v8, v0 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v5 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v5, v8, v5 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; GFX6-NEXT: v_max_i32_e32 v6, v6, v8 +; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v7 -; GFX6-NEXT: v_max_i32_e32 v7, v7, v8 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 -; GFX6-NEXT: v_max_i32_e32 v4, v4, v7 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 -; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 +; GFX6-NEXT: v_max_i32_e32 v7, v8, v7 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v2, v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 -; GFX6-NEXT: v_max_i32_e32 v3, v3, v5 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v2 +; GFX6-NEXT: v_max_i32_e32 v2, v8, v2 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 +; GFX6-NEXT: v_max_i32_e32 v3, v8, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v8i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v0 +; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v0, v8, v0 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v1, v8, v1 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 ; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v4, v8, v4 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v5 ; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v5, v8, v5 ; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; GFX7-NEXT: v_max_i32_e32 v6, v6, v8 +; GFX7-NEXT: v_max_i32_e32 v6, v8, v6 ; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v7 -; GFX7-NEXT: v_max_i32_e32 v7, v7, v8 -; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 -; GFX7-NEXT: v_max_i32_e32 v4, v4, v7 -; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 -; GFX7-NEXT: v_max_i32_e32 v5, v5, v7 +; GFX7-NEXT: v_max_i32_e32 v7, v8, v7 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; GFX7-NEXT: v_max_i32_e32 v2, v2, v5 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 -; GFX7-NEXT: v_max_i32_e32 v3, v3, v5 -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX7-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX7-NEXT: v_max_i32_e32 v1, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v2 +; GFX7-NEXT: v_max_i32_e32 v2, v8, v2 +; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 +; GFX7-NEXT: v_max_i32_e32 v3, v8, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v8 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v8i16: @@ -682,155 +674,147 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) { ; GFX6-LABEL: v_abs_v16i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v0, v16, v0 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v1 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v1, v16, v1 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 +; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v4, v16, v4 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v5, v16, v5 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 +; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v9 +; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v9, v16, v9 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v12 ; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v12, v16, v12 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v13 ; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v13, v16, v13 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v14 -; GFX6-NEXT: v_max_i32_e32 v14, v14, v16 +; GFX6-NEXT: v_max_i32_e32 v14, v16, v14 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v15 -; GFX6-NEXT: v_max_i32_e32 v15, v15, v16 -; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12 -; GFX6-NEXT: v_max_i32_e32 v12, v12, v15 -; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13 -; GFX6-NEXT: v_max_i32_e32 v13, v13, v15 +; GFX6-NEXT: v_max_i32_e32 v15, v16, v15 ; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10 -; GFX6-NEXT: v_max_i32_e32 v10, v10, v13 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11 -; GFX6-NEXT: v_max_i32_e32 v11, v11, v13 -; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8 -; GFX6-NEXT: v_max_i32_e32 v8, v8, v11 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9 -; GFX6-NEXT: v_max_i32_e32 v9, v9, v11 +; GFX6-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v10 +; GFX6-NEXT: v_max_i32_e32 v10, v16, v10 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v11 +; GFX6-NEXT: v_max_i32_e32 v11, v16, v11 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v11 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6 -; GFX6-NEXT: v_max_i32_e32 v6, v6, v9 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7 -; GFX6-NEXT: v_max_i32_e32 v7, v7, v9 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 -; GFX6-NEXT: v_max_i32_e32 v4, v4, v7 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 -; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 +; GFX6-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GFX6-NEXT: v_max_i32_e32 v6, v16, v6 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v7 +; GFX6-NEXT: v_max_i32_e32 v7, v16, v7 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v7 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v2, v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 -; GFX6-NEXT: v_max_i32_e32 v3, v3, v5 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v6, v6, v16 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v2 +; GFX6-NEXT: v_max_i32_e32 v2, v16, v2 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v3 +; GFX6-NEXT: v_max_i32_e32 v3, v16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v16 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX6-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX6-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX6-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; GFX6-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v16i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v0, v16, v0 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v1 +; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v1, v16, v1 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 +; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v4, v16, v4 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v5, v16, v5 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 +; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v8, v16, v8 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v9 +; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v9, v16, v9 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v12 ; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v12, v16, v12 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v13 ; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v13, v16, v13 ; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v14 -; GFX7-NEXT: v_max_i32_e32 v14, v14, v16 +; GFX7-NEXT: v_max_i32_e32 v14, v16, v14 ; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v15 -; GFX7-NEXT: v_max_i32_e32 v15, v15, v16 -; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12 -; GFX7-NEXT: v_max_i32_e32 v12, v12, v15 -; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13 -; GFX7-NEXT: v_max_i32_e32 v13, v13, v15 +; GFX7-NEXT: v_max_i32_e32 v15, v16, v15 ; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10 -; GFX7-NEXT: v_max_i32_e32 v10, v10, v13 -; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11 -; GFX7-NEXT: v_max_i32_e32 v11, v11, v13 -; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8 -; GFX7-NEXT: v_max_i32_e32 v8, v8, v11 -; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9 -; GFX7-NEXT: v_max_i32_e32 v9, v9, v11 +; GFX7-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v10 +; GFX7-NEXT: v_max_i32_e32 v10, v16, v10 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v11 +; GFX7-NEXT: v_max_i32_e32 v11, v16, v11 ; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v11 ; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6 -; GFX7-NEXT: v_max_i32_e32 v6, v6, v9 -; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7 -; GFX7-NEXT: v_max_i32_e32 v7, v7, v9 -; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 -; GFX7-NEXT: v_max_i32_e32 v4, v4, v7 -; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 -; GFX7-NEXT: v_max_i32_e32 v5, v5, v7 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GFX7-NEXT: v_max_i32_e32 v6, v16, v6 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v7 +; GFX7-NEXT: v_max_i32_e32 v7, v16, v7 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v7 ; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; GFX7-NEXT: v_max_i32_e32 v2, v2, v5 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 -; GFX7-NEXT: v_max_i32_e32 v3, v3, v5 -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX7-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX7-NEXT: v_max_i32_e32 v1, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v16 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v2 +; GFX7-NEXT: v_max_i32_e32 v2, v16, v2 +; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v3 +; GFX7-NEXT: v_max_i32_e32 v3, v16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v16 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v16i16: @@ -974,303 +958,287 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) { ; GFX6-LABEL: v_abs_v32i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v0 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v0, v31, v0 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v1 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v1, v31, v1 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v4 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v4, v31, v4 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v5 +; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v5, v31, v5 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v8 +; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v8, v31, v8 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v9 +; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v9, v31, v9 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v12 +; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v12, v31, v12 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v13 +; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v13, v31, v13 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v16 +; GFX6-NEXT: v_bfe_i32 v17, v17, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v16, v31, v16 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v17 +; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v17, v31, v17 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v20 +; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v20, v31, v20 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v21 +; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v21, v31, v21 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24 +; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v24, v31, v24 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25 ; GFX6-NEXT: v_bfe_i32 v28, v28, 0, 16 +; GFX6-NEXT: v_max_i32_e32 v25, v31, v25 ; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v28 ; GFX6-NEXT: v_bfe_i32 v29, v29, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v28, v28, v31 +; GFX6-NEXT: v_max_i32_e32 v28, v31, v28 ; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v29 ; GFX6-NEXT: v_bfe_i32 v30, v30, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v29, v29, v31 +; GFX6-NEXT: v_max_i32_e32 v29, v31, v29 ; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v30 +; GFX6-NEXT: v_max_i32_e32 v30, v31, v30 +; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX6-NEXT: v_bfe_i32 v26, v26, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v30, v30, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v26 ; GFX6-NEXT: v_bfe_i32 v27, v27, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v26, v26, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v27 -; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v27, v27, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24 -; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v24, v24, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25 ; GFX6-NEXT: v_bfe_i32 v22, v22, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v25, v25, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v22 ; GFX6-NEXT: v_bfe_i32 v23, v23, 0, 16 -; GFX6-NEXT: v_max_i32_e32 v22, v22, v31 -; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v23 -; GFX6-NEXT: v_max_i32_e32 v23, v23, v31 -; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX6-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX6-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16 -; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX6-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX6-NEXT: v_sub_i32_e32 v29, vcc, 0, v20 -; GFX6-NEXT: v_max_i32_e32 v20, v20, v29 ; GFX6-NEXT: v_bfe_i32 v18, v18, 0, 16 ; GFX6-NEXT: v_bfe_i32 v19, v19, 0, 16 -; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX6-NEXT: v_bfe_i32 v17, v17, 0, 16 ; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16 -; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16 ; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16 ; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16 -; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX6-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX6-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_bfe_i32 v23, v31, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v25, vcc, 0, v23 -; GFX6-NEXT: v_max_i32_e32 v23, v23, v25 -; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX6-NEXT: v_or_b32_e32 v30, v30, v23 -; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 0, v21 -; GFX6-NEXT: v_max_i32_e32 v21, v21, v23 -; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX6-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v18 -; GFX6-NEXT: v_max_i32_e32 v18, v18, v21 -; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v19 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v21 -; GFX6-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX6-NEXT: v_or_b32_e32 v18, v18, v19 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v16 -; GFX6-NEXT: v_max_i32_e32 v16, v16, v19 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v17 -; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX6-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v14 -; GFX6-NEXT: v_max_i32_e32 v14, v14, v17 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v15 -; GFX6-NEXT: v_max_i32_e32 v15, v15, v17 -; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX6-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12 -; GFX6-NEXT: v_max_i32_e32 v12, v12, v15 -; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13 -; GFX6-NEXT: v_max_i32_e32 v13, v13, v15 -; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX6-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10 -; GFX6-NEXT: v_max_i32_e32 v10, v10, v13 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11 -; GFX6-NEXT: v_max_i32_e32 v11, v11, v13 -; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX6-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8 -; GFX6-NEXT: v_max_i32_e32 v8, v8, v11 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9 -; GFX6-NEXT: v_max_i32_e32 v9, v9, v11 -; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6 -; GFX6-NEXT: v_max_i32_e32 v6, v6, v9 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7 -; GFX6-NEXT: v_max_i32_e32 v7, v7, v9 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 -; GFX6-NEXT: v_max_i32_e32 v4, v4, v7 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 -; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, v2, v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 -; GFX6-NEXT: v_max_i32_e32 v3, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GFX6-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GFX6-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GFX6-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GFX6-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GFX6-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GFX6-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GFX6-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX6-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX6-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX6-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX6-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX6-NEXT: v_or_b32_e32 v24, v24, v25 +; GFX6-NEXT: v_or_b32_e32 v28, v28, v29 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_bfe_i32 v31, v31, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v31 +; GFX6-NEXT: v_max_i32_e32 v31, v32, v31 +; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX6-NEXT: v_or_b32_e32 v30, v30, v32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v26 +; GFX6-NEXT: v_max_i32_e32 v26, v32, v26 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v27 +; GFX6-NEXT: v_max_i32_e32 v27, v32, v27 +; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX6-NEXT: v_or_b32_e32 v26, v26, v32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v22 +; GFX6-NEXT: v_max_i32_e32 v22, v32, v22 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v23 +; GFX6-NEXT: v_max_i32_e32 v23, v32, v23 +; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; GFX6-NEXT: v_or_b32_e32 v22, v22, v32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v18 +; GFX6-NEXT: v_max_i32_e32 v18, v32, v18 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v19 +; GFX6-NEXT: v_max_i32_e32 v19, v32, v19 +; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; GFX6-NEXT: v_or_b32_e32 v18, v18, v32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v14 +; GFX6-NEXT: v_max_i32_e32 v14, v32, v14 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v15 +; GFX6-NEXT: v_max_i32_e32 v15, v32, v15 +; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX6-NEXT: v_or_b32_e32 v14, v14, v32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v10 +; GFX6-NEXT: v_max_i32_e32 v10, v32, v10 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v11 +; GFX6-NEXT: v_max_i32_e32 v11, v32, v11 +; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX6-NEXT: v_or_b32_e32 v10, v10, v32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v6 +; GFX6-NEXT: v_max_i32_e32 v6, v32, v6 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v7 +; GFX6-NEXT: v_max_i32_e32 v7, v32, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v6, v32 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v2 +; GFX6-NEXT: v_max_i32_e32 v2, v32, v2 +; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v3 +; GFX6-NEXT: v_max_i32_e32 v3, v32, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v32 +; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX6-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; GFX6-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GFX6-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; GFX6-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; GFX6-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; GFX6-NEXT: v_alignbit_b32 v29, v30, v29, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_abs_v32i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v0 +; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v0, v31, v0 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v1 +; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v1, v31, v1 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v4 +; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v4, v31, v4 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v5 +; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v5, v31, v5 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v8 +; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v8, v31, v8 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v9 +; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v9, v31, v9 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v12 +; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v12, v31, v12 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v13 +; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v13, v31, v13 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v16 +; GFX7-NEXT: v_bfe_i32 v17, v17, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v16, v31, v16 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v17 +; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v17, v31, v17 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v20 +; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v20, v31, v20 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v21 +; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v21, v31, v21 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24 +; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v24, v31, v24 +; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25 ; GFX7-NEXT: v_bfe_i32 v28, v28, 0, 16 +; GFX7-NEXT: v_max_i32_e32 v25, v31, v25 ; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v28 ; GFX7-NEXT: v_bfe_i32 v29, v29, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v28, v28, v31 +; GFX7-NEXT: v_max_i32_e32 v28, v31, v28 ; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v29 ; GFX7-NEXT: v_bfe_i32 v30, v30, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v29, v29, v31 +; GFX7-NEXT: v_max_i32_e32 v29, v31, v29 ; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v30 +; GFX7-NEXT: v_max_i32_e32 v30, v31, v30 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-NEXT: v_bfe_i32 v26, v26, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v30, v30, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v26 ; GFX7-NEXT: v_bfe_i32 v27, v27, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v26, v26, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v27 -; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v27, v27, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24 -; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v24, v24, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25 ; GFX7-NEXT: v_bfe_i32 v22, v22, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v25, v25, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v22 ; GFX7-NEXT: v_bfe_i32 v23, v23, 0, 16 -; GFX7-NEXT: v_max_i32_e32 v22, v22, v31 -; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v23 -; GFX7-NEXT: v_max_i32_e32 v23, v23, v31 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX7-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX7-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16 -; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX7-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX7-NEXT: v_sub_i32_e32 v29, vcc, 0, v20 -; GFX7-NEXT: v_max_i32_e32 v20, v20, v29 ; GFX7-NEXT: v_bfe_i32 v18, v18, 0, 16 ; GFX7-NEXT: v_bfe_i32 v19, v19, 0, 16 -; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX7-NEXT: v_bfe_i32 v17, v17, 0, 16 ; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16 -; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16 ; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16 ; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16 -; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16 ; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX7-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v23, v31, 0, 16 -; GFX7-NEXT: v_sub_i32_e32 v25, vcc, 0, v23 -; GFX7-NEXT: v_max_i32_e32 v23, v23, v25 -; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX7-NEXT: v_or_b32_e32 v30, v30, v23 -; GFX7-NEXT: v_sub_i32_e32 v23, vcc, 0, v21 -; GFX7-NEXT: v_max_i32_e32 v21, v21, v23 -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX7-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v18 -; GFX7-NEXT: v_max_i32_e32 v18, v18, v21 -; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v19 -; GFX7-NEXT: v_max_i32_e32 v19, v19, v21 -; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX7-NEXT: v_or_b32_e32 v18, v18, v19 -; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v16 -; GFX7-NEXT: v_max_i32_e32 v16, v16, v19 -; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v17 -; GFX7-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX7-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v14 -; GFX7-NEXT: v_max_i32_e32 v14, v14, v17 -; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v15 -; GFX7-NEXT: v_max_i32_e32 v15, v15, v17 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12 -; GFX7-NEXT: v_max_i32_e32 v12, v12, v15 -; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13 -; GFX7-NEXT: v_max_i32_e32 v13, v13, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10 -; GFX7-NEXT: v_max_i32_e32 v10, v10, v13 -; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11 -; GFX7-NEXT: v_max_i32_e32 v11, v11, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8 -; GFX7-NEXT: v_max_i32_e32 v8, v8, v11 -; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9 -; GFX7-NEXT: v_max_i32_e32 v9, v9, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6 -; GFX7-NEXT: v_max_i32_e32 v6, v6, v9 -; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7 -; GFX7-NEXT: v_max_i32_e32 v7, v7, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 -; GFX7-NEXT: v_max_i32_e32 v4, v4, v7 -; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 -; GFX7-NEXT: v_max_i32_e32 v5, v5, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 -; GFX7-NEXT: v_max_i32_e32 v2, v2, v5 -; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 -; GFX7-NEXT: v_max_i32_e32 v3, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 -; GFX7-NEXT: v_max_i32_e32 v0, v0, v3 -; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GFX7-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GFX7-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GFX7-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GFX7-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GFX7-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX7-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX7-NEXT: v_or_b32_e32 v24, v24, v25 +; GFX7-NEXT: v_or_b32_e32 v28, v28, v29 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_i32 v31, v31, 0, 16 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v31 +; GFX7-NEXT: v_max_i32_e32 v31, v32, v31 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v30, v30, v32 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v26 +; GFX7-NEXT: v_max_i32_e32 v26, v32, v26 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v27 +; GFX7-NEXT: v_max_i32_e32 v27, v32, v27 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX7-NEXT: v_or_b32_e32 v26, v26, v32 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v22 +; GFX7-NEXT: v_max_i32_e32 v22, v32, v22 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v23 +; GFX7-NEXT: v_max_i32_e32 v23, v32, v23 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; GFX7-NEXT: v_or_b32_e32 v22, v22, v32 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v18 +; GFX7-NEXT: v_max_i32_e32 v18, v32, v18 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v19 +; GFX7-NEXT: v_max_i32_e32 v19, v32, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; GFX7-NEXT: v_or_b32_e32 v18, v18, v32 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v14 +; GFX7-NEXT: v_max_i32_e32 v14, v32, v14 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v15 +; GFX7-NEXT: v_max_i32_e32 v15, v32, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX7-NEXT: v_or_b32_e32 v14, v14, v32 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v10 +; GFX7-NEXT: v_max_i32_e32 v10, v32, v10 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v11 +; GFX7-NEXT: v_max_i32_e32 v11, v32, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v32 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v6 +; GFX7-NEXT: v_max_i32_e32 v6, v32, v6 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v7 +; GFX7-NEXT: v_max_i32_e32 v7, v32, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v32 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v2 +; GFX7-NEXT: v_max_i32_e32 v2, v32, v2 +; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v3 +; GFX7-NEXT: v_max_i32_e32 v3, v32, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v32 +; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_abs_v32i16: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index e71bf15384727..e34aaf205cb95 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -136,7 +136,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) { ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, 5, v1, vcc ; GCN-NEXT: v_sub_u32_e32 v1, vcc, 0, v0 -; GCN-NEXT: v_max_i32_e32 v1, v0, v1 +; GCN-NEXT: v_max_i32_e32 v1, v1, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 ; GCN-NEXT: s_mov_b32 s4, 0xf4240 @@ -218,7 +218,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) { ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 5, vcc ; GCN-NEXT: v_sub_u32_e32 v1, vcc, 0, v0 -; GCN-NEXT: v_max_i32_e32 v1, v0, v1 +; GCN-NEXT: v_max_i32_e32 v1, v1, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 ; GCN-NEXT: s_mov_b32 s4, 0xf4240 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index e27164c2d6d69..948811ea45f77 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -6191,37 +6191,34 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_add_i32 s3, s3, s8 -; GFX6-NEXT: s_xor_b32 s3, s3, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_ashr_i32 s9, s2, 31 -; GFX6-NEXT: s_add_i32 s2, s2, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_xor_b32 s2, s2, s9 +; GFX6-NEXT: s_abs_i32 s8, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_sub_i32 s4, 0, s8 +; GFX6-NEXT: s_abs_i32 s9, s2 ; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s0, s2, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: s_mul_i32 s0, s0, s8 +; GFX6-NEXT: s_sub_i32 s0, s9, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_xor_b32 s0, s2, s3 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_xor_b32 s0, s9, s8 +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -6233,35 +6230,32 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 -; GFX9-NEXT: s_ashr_i32 s5, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s5 +; GFX9-NEXT: s_abs_i32 s4, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s6, 0, s4 +; GFX9-NEXT: s_abs_i32 s5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_mul_i32 s6, s6, s7 ; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 ; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7 -; GFX9-NEXT: s_mul_i32 s8, s6, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7 +; GFX9-NEXT: s_mul_i32 s8, s6, s4 +; GFX9-NEXT: s_sub_i32 s5, s5, s8 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_sub_i32 s8, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_sub_i32 s8, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 ; GFX9-NEXT: s_cselect_b32 s6, s7, s6 -; GFX9-NEXT: s_cselect_b32 s2, s8, s2 +; GFX9-NEXT: s_cselect_b32 s5, s8, s5 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s7, s6 -; GFX9-NEXT: s_xor_b32 s3, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s4, s7, s6 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: s_xor_b32 s3, s4, s2 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -6706,38 +6700,37 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-LABEL: srem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_xor_b32 s4, s3, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX6-NEXT: s_sub_i32 s3, 0, s4 -; GFX6-NEXT: s_ashr_i32 s5, s2, 31 -; GFX6-NEXT: s_add_i32 s2, s2, s5 +; GFX6-NEXT: s_abs_i32 s3, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s4, 0, s3 +; GFX6-NEXT: s_abs_i32 s8, s2 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_xor_b32 s6, s2, s5 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s4 -; GFX6-NEXT: s_sub_i32 s6, s6, s7 -; GFX6-NEXT: s_sub_i32 s7, s6, s4 -; GFX6-NEXT: s_cmp_ge_u32 s6, s4 -; GFX6-NEXT: s_cselect_b32 s6, s7, s6 -; GFX6-NEXT: s_sub_i32 s7, s6, s4 -; GFX6-NEXT: s_cmp_ge_u32 s6, s4 -; GFX6-NEXT: s_cselect_b32 s4, s7, s6 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s3 +; GFX6-NEXT: s_sub_i32 s0, s8, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_ashr_i32 s1, s2, 31 +; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: @@ -6746,32 +6739,29 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_abs_i32 s3, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s5, 0, s3 -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_abs_i32 s4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_mul_i32 s5, s5, s6 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 ; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s6 ; GFX9-NEXT: s_mul_i32 s5, s5, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: s_sub_i32 s5, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s3, s5, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 3cf70c42390c2..d7d697ef85b9f 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -576,11 +576,11 @@ define i32 @sdiv32(i32 %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v2, 0, v1 -; GFX9-NEXT: v_max_i32_e32 v2, v1, v2 +; GFX9-NEXT: v_max_i32_e32 v2, v2, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2 ; GFX9-NEXT: v_sub_u32_e32 v5, 0, v0 -; GFX9-NEXT: v_max_i32_e32 v5, v0, v5 +; GFX9-NEXT: v_max_i32_e32 v5, v5, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0 @@ -640,11 +640,11 @@ define i32 @srem32(i32 %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v2, 0, v1 -; GFX9-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX9-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1 ; GFX9-NEXT: v_sub_u32_e32 v4, 0, v0 -; GFX9-NEXT: v_max_i32_e32 v4, v0, v4 +; GFX9-NEXT: v_max_i32_e32 v4, v4, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 5c0f813c8c829..441509ba01f64 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -391,156 +391,144 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-NEXT: s_abs_i32 s1, s0 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GCN-NEXT: s_sub_i32 s6, 0, s1 -; GCN-NEXT: v_readfirstlane_b32 s8, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_lo_u32 v4, s6, v2 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: s_abs_i32 s7, s6 -; GCN-NEXT: s_xor_b32 s0, s6, s0 -; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 -; GCN-NEXT: s_ashr_i32 s6, s0, 31 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s0, s0, s1 -; GCN-NEXT: s_sub_i32 s0, s7, s0 -; GCN-NEXT: s_sub_i32 s7, s0, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: s_cselect_b32 s0, s7, s0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_abs_i32 s7, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_sub_i32 s4, 0, s7 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v3 -; GCN-NEXT: v_readfirstlane_b32 s4, v1 -; GCN-NEXT: s_xor_b32 s5, s4, s8 -; GCN-NEXT: s_abs_i32 s4, s4 -; GCN-NEXT: v_mul_hi_u32 v1, v3, v4 -; GCN-NEXT: s_ashr_i32 s5, s5, 31 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_mul_hi_u32 v1, s4, v1 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: s_mul_i32 s6, s6, s7 -; GCN-NEXT: s_sub_i32 s4, s4, s6 -; GCN-NEXT: s_sub_i32 s6, s4, s7 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GCN-NEXT: s_cmp_ge_u32 s4, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: s_cselect_b32 s4, s6, s4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GCN-NEXT: s_cmp_ge_u32 s4, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v1, s5, v1 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s5, v1 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 +; GCN-NEXT: v_xor_b32_e32 v4, v0, v2 +; GCN-NEXT: v_xor_b32_e32 v7, v1, v3 +; GCN-NEXT: v_max_i32_e32 v2, v2, v6 +; GCN-NEXT: v_max_i32_e32 v3, v3, v9 +; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2 +; GCN-NEXT: v_cvt_f32_u32_e32 v9, v3 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GCN-NEXT: v_max_i32_e32 v0, v0, v5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v9 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 +; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 +; GCN-NEXT: v_mul_lo_u32 v9, v9, v6 +; GCN-NEXT: v_mul_lo_u32 v10, v10, v5 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; GCN-NEXT: v_mul_hi_u32 v9, v6, v9 +; GCN-NEXT: v_max_i32_e32 v1, v1, v8 +; GCN-NEXT: v_mul_hi_u32 v8, v5, v10 +; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v6 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 +; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GCN-NEXT: v_mul_lo_u32 v8, v6, v2 +; GCN-NEXT: v_mul_lo_u32 v10, v5, v3 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v6 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v9, vcc, v1, v3 +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v6 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] +; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v7 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_v2i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; TONGA-NEXT: s_mov_b32 s3, 0xf000 -; TONGA-NEXT: s_mov_b32 s2, -1 -; TONGA-NEXT: s_mov_b32 s10, s2 -; TONGA-NEXT: s_mov_b32 s11, s3 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s6 -; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_readfirstlane_b32 s0, v2 -; TONGA-NEXT: s_abs_i32 s1, s0 -; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s1 -; TONGA-NEXT: s_sub_i32 s6, 0, s1 -; TONGA-NEXT: v_readfirstlane_b32 s8, v3 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2 -; TONGA-NEXT: v_mul_lo_u32 v4, s6, v2 -; TONGA-NEXT: v_readfirstlane_b32 s6, v0 -; TONGA-NEXT: s_abs_i32 s7, s6 -; TONGA-NEXT: s_xor_b32 s0, s6, s0 -; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4 -; TONGA-NEXT: s_ashr_i32 s6, s0, 31 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v4 -; TONGA-NEXT: v_mul_hi_u32 v0, s7, v0 -; TONGA-NEXT: v_readfirstlane_b32 s0, v0 -; TONGA-NEXT: s_mul_i32 s0, s0, s1 -; TONGA-NEXT: s_sub_i32 s0, s7, s0 -; TONGA-NEXT: s_sub_i32 s7, s0, s1 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; TONGA-NEXT: s_cmp_ge_u32 s0, s1 -; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; TONGA-NEXT: s_cselect_b32 s0, s7, s0 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; TONGA-NEXT: s_cmp_ge_u32 s0, s1 -; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 -; TONGA-NEXT: s_abs_i32 s7, s8 -; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s7 -; TONGA-NEXT: s_mov_b32 s0, s4 -; TONGA-NEXT: s_sub_i32 s4, 0, s7 -; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; TONGA-NEXT: v_xor_b32_e32 v0, s6, v0 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s6, v0 -; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 -; TONGA-NEXT: v_mul_lo_u32 v4, s4, v3 -; TONGA-NEXT: v_readfirstlane_b32 s4, v1 -; TONGA-NEXT: s_xor_b32 s5, s4, s8 -; TONGA-NEXT: s_abs_i32 s4, s4 -; TONGA-NEXT: v_mul_hi_u32 v1, v3, v4 -; TONGA-NEXT: s_ashr_i32 s5, s5, 31 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1 -; TONGA-NEXT: v_readfirstlane_b32 s6, v1 -; TONGA-NEXT: s_mul_i32 s6, s6, s7 -; TONGA-NEXT: s_sub_i32 s4, s4, s6 -; TONGA-NEXT: s_sub_i32 s6, s4, s7 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1 -; TONGA-NEXT: s_cmp_ge_u32 s4, s7 -; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; TONGA-NEXT: s_cselect_b32 s4, s6, s4 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1 -; TONGA-NEXT: s_cmp_ge_u32 s4, s7 -; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; TONGA-NEXT: v_xor_b32_e32 v1, s5, v1 -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s5, v1 -; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3 +; TONGA-NEXT: v_xor_b32_e32 v4, v0, v2 +; TONGA-NEXT: v_xor_b32_e32 v7, v1, v3 +; TONGA-NEXT: v_max_i32_e32 v2, v2, v6 +; TONGA-NEXT: v_max_i32_e32 v3, v3, v9 +; TONGA-NEXT: v_cvt_f32_u32_e32 v6, v2 +; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v3 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; TONGA-NEXT: v_max_i32_e32 v0, v0, v5 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v9 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 +; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 +; TONGA-NEXT: v_cvt_u32_f32_e32 v6, v6 +; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v3 +; TONGA-NEXT: v_mul_lo_u32 v9, v9, v6 +; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, 0, v1 +; TONGA-NEXT: v_mul_hi_u32 v9, v6, v9 +; TONGA-NEXT: v_max_i32_e32 v1, v1, v8 +; TONGA-NEXT: v_mul_hi_u32 v8, v5, v10 +; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v4 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v9 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v8 +; TONGA-NEXT: v_mul_hi_u32 v6, v0, v6 +; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 +; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; TONGA-NEXT: v_mul_lo_u32 v8, v6, v2 +; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v6 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 +; TONGA-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 +; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v6 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] +; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v5 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v7 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v7 +; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32: @@ -558,44 +546,44 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_readfirstlane_b32 s0, v2 ; GFX9-NEXT: s_abs_i32 s1, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_xor_b32 s0, s4, s0 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_xor_b32 s0, s5, s0 ; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_sub_i32 s0, 0, s1 -; GFX9-NEXT: s_abs_i32 s4, s4 -; GFX9-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9-NEXT: s_abs_i32 s5, s5 +; GFX9-NEXT: v_readfirstlane_b32 s4, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_mul_i32 s0, s0, s7 ; GFX9-NEXT: s_mul_hi_u32 s0, s7, s0 ; GFX9-NEXT: s_add_i32 s7, s7, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s7 ; GFX9-NEXT: s_mul_i32 s7, s0, s1 -; GFX9-NEXT: s_sub_i32 s4, s4, s7 +; GFX9-NEXT: s_sub_i32 s5, s5, s7 ; GFX9-NEXT: s_add_i32 s10, s0, 1 -; GFX9-NEXT: s_sub_i32 s7, s4, s1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s1 +; GFX9-NEXT: s_sub_i32 s7, s5, s1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s1 ; GFX9-NEXT: s_cselect_b32 s0, s10, s0 -; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 ; GFX9-NEXT: s_add_i32 s7, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s1 -; GFX9-NEXT: s_cselect_b32 s4, s7, s0 -; GFX9-NEXT: s_abs_i32 s7, s5 +; GFX9-NEXT: s_cmp_ge_u32 s5, s1 +; GFX9-NEXT: s_cselect_b32 s5, s7, s0 +; GFX9-NEXT: s_abs_i32 s7, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 +; GFX9-NEXT: s_xor_b32 s5, s5, s6 ; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s8, s5 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: s_abs_i32 s8, s8 -; GFX9-NEXT: s_ashr_i32 s5, s5, 31 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_mul_i32 s9, s9, s6 ; GFX9-NEXT: s_mul_hi_u32 s9, s6, s9 @@ -611,10 +599,10 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_add_i32 s9, s6, 1 ; GFX9-NEXT: s_cmp_ge_u32 s8, s7 ; GFX9-NEXT: s_cselect_b32 s6, s9, s6 -; GFX9-NEXT: s_xor_b32 s6, s6, s5 -; GFX9-NEXT: s_sub_i32 s5, s6, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: s_sub_i32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -804,255 +792,255 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: sdiv_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: s_mov_b32 s6, s10 -; GCN-NEXT: s_mov_b32 s7, s11 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s2 -; GCN-NEXT: s_mov_b32 s5, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 -; GCN-NEXT: s_mov_b32 s8, s0 -; GCN-NEXT: s_mov_b32 s9, s1 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-NEXT: s_abs_i32 s13, s0 -; GCN-NEXT: s_abs_i32 s14, s1 -; GCN-NEXT: s_abs_i32 s15, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s14 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15 -; GCN-NEXT: v_readfirstlane_b32 s6, v3 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 +; GCN-NEXT: v_xor_b32_e32 v8, v0, v4 +; GCN-NEXT: v_max_i32_e32 v4, v4, v10 +; GCN-NEXT: v_cvt_f32_u32_e32 v10, v4 +; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 +; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GCN-NEXT: v_max_i32_e32 v5, v5, v13 +; GCN-NEXT: v_cvt_f32_u32_e32 v13, v5 +; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 +; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GCN-NEXT: v_rcp_iflag_f32_e32 v13, v13 +; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 +; GCN-NEXT: v_mul_lo_u32 v16, v16, v10 +; GCN-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 +; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GCN-NEXT: v_max_i32_e32 v0, v0, v9 +; GCN-NEXT: v_mul_hi_u32 v16, v10, v16 +; GCN-NEXT: v_max_i32_e32 v1, v1, v12 +; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16 +; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GCN-NEXT: v_mul_lo_u32 v16, v16, v13 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v10 +; GCN-NEXT: v_xor_b32_e32 v14, v2, v6 +; GCN-NEXT: v_max_i32_e32 v6, v6, v15 +; GCN-NEXT: v_mul_hi_u32 v12, v13, v16 +; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6 +; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GCN-NEXT: v_mul_lo_u32 v13, v10, v4 +; GCN-NEXT: v_mul_hi_u32 v12, v1, v12 +; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15 +; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v13, vcc, v0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1] +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 +; GCN-NEXT: v_mul_lo_u32 v0, v12, v5 +; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] +; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v5 +; GCN-NEXT: v_mul_lo_u32 v4, v4, v9 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7 +; GCN-NEXT: v_max_i32_e32 v5, v7, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5 +; GCN-NEXT: v_mul_hi_u32 v4, v9, v4 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: s_abs_i32 s17, s6 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s17 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 +; GCN-NEXT: v_max_i32_e32 v2, v2, v9 +; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s3, v4 -; GCN-NEXT: v_readfirstlane_b32 s4, v5 -; GCN-NEXT: v_readfirstlane_b32 s5, v6 -; GCN-NEXT: s_xor_b32 s12, s3, s0 -; GCN-NEXT: s_xor_b32 s0, s4, s1 -; GCN-NEXT: s_xor_b32 s1, s5, s2 -; GCN-NEXT: s_sub_i32 s2, 0, s13 -; GCN-NEXT: s_ashr_i32 s18, s0, 31 -; GCN-NEXT: s_sub_i32 s0, 0, s14 -; GCN-NEXT: s_ashr_i32 s19, s1, 31 -; GCN-NEXT: s_sub_i32 s1, 0, s15 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s0, v1 -; GCN-NEXT: v_mul_lo_u32 v6, s1, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 -; GCN-NEXT: s_sub_i32 s20, 0, s17 -; GCN-NEXT: v_readfirstlane_b32 s7, v7 -; GCN-NEXT: s_abs_i32 s3, s3 -; GCN-NEXT: s_abs_i32 s4, s4 -; GCN-NEXT: s_abs_i32 s5, s5 -; GCN-NEXT: v_mul_lo_u32 v7, s20, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 -; GCN-NEXT: v_mul_hi_u32 v1, s4, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s5, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v3, v7 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s13 -; GCN-NEXT: v_mul_lo_u32 v6, v1, s14 -; GCN-NEXT: v_mul_lo_u32 v8, v2, s15 -; GCN-NEXT: s_abs_i32 s16, s7 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GCN-NEXT: v_mul_hi_u32 v3, s16, v3 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v4 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, s4, v6 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v2 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6 -; GCN-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8 -; GCN-NEXT: v_subrev_i32_e32 v10, vcc, s13, v4 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] -; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s14, v6 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] -; GCN-NEXT: v_subrev_i32_e32 v7, vcc, s15, v8 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3] -; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5] -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v3, s17 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v5 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 -; GCN-NEXT: s_ashr_i32 s12, s12, 31 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s18, v1 -; GCN-NEXT: v_xor_b32_e32 v2, s19, v2 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s16, v4 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s18, v1 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s19, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; GCN-NEXT: v_subrev_i32_e32 v6, vcc, s17, v4 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; GCN-NEXT: s_xor_b32 s0, s7, s6 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 -; GCN-NEXT: s_ashr_i32 s0, s0, 31 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_xor_b32_e32 v3, s0, v3 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1] +; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1 +; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; GCN-NEXT: v_mul_lo_u32 v10, v10, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v11 +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc +; GCN-NEXT: v_mul_hi_u32 v4, v9, v10 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; GCN-NEXT: v_max_i32_e32 v6, v3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v6, v4 +; GCN-NEXT: v_xor_b32_e32 v2, v2, v14 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; GCN-NEXT: v_mul_lo_u32 v8, v4, v5 +; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 +; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GCN-NEXT: v_xor_b32_e32 v4, v4, v3 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_v4i32: ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; TONGA-NEXT: s_mov_b32 s11, 0xf000 -; TONGA-NEXT: s_mov_b32 s10, -1 -; TONGA-NEXT: s_mov_b32 s6, s10 -; TONGA-NEXT: s_mov_b32 s7, s11 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s4, s2 -; TONGA-NEXT: s_mov_b32 s5, s3 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 -; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 -; TONGA-NEXT: s_mov_b32 s8, s0 -; TONGA-NEXT: s_mov_b32 s9, s1 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_readfirstlane_b32 s0, v0 -; TONGA-NEXT: v_readfirstlane_b32 s1, v1 -; TONGA-NEXT: v_readfirstlane_b32 s2, v2 -; TONGA-NEXT: s_abs_i32 s13, s0 -; TONGA-NEXT: s_abs_i32 s14, s1 -; TONGA-NEXT: s_abs_i32 s15, s2 -; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s13 -; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s14 -; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s15 -; TONGA-NEXT: v_readfirstlane_b32 s6, v3 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 +; TONGA-NEXT: s_waitcnt vmcnt(0) +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 +; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4 +; TONGA-NEXT: v_max_i32_e32 v4, v4, v10 +; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v4 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 +; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; TONGA-NEXT: v_max_i32_e32 v5, v5, v13 +; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v5 +; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4 +; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v13, v13 +; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1 +; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10 +; TONGA-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 +; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v13 +; TONGA-NEXT: v_max_i32_e32 v0, v0, v9 +; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16 +; TONGA-NEXT: v_max_i32_e32 v1, v1, v12 +; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16 +; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5 +; TONGA-NEXT: v_mul_lo_u32 v16, v16, v13 +; TONGA-NEXT: v_mul_hi_u32 v10, v0, v10 +; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6 +; TONGA-NEXT: v_max_i32_e32 v6, v6, v15 +; TONGA-NEXT: v_mul_hi_u32 v12, v13, v16 +; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6 +; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, v13, v12 +; TONGA-NEXT: v_mul_lo_u32 v13, v10, v4 +; TONGA-NEXT: v_mul_hi_u32 v12, v1, v12 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15 +; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v13 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v0, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 +; TONGA-NEXT: v_mul_lo_u32 v0, v12, v5 +; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v6 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] +; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v5 +; TONGA-NEXT: v_mul_lo_u32 v4, v4, v9 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7 +; TONGA-NEXT: v_max_i32_e32 v5, v7, v0 +; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5 +; TONGA-NEXT: v_mul_hi_u32 v4, v9, v4 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; TONGA-NEXT: s_abs_i32 s17, s6 -; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s17 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 +; TONGA-NEXT: v_max_i32_e32 v2, v2, v9 +; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0 -; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1 -; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2 -; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_readfirstlane_b32 s3, v4 -; TONGA-NEXT: v_readfirstlane_b32 s4, v5 -; TONGA-NEXT: v_readfirstlane_b32 s5, v6 -; TONGA-NEXT: s_xor_b32 s12, s3, s0 -; TONGA-NEXT: s_xor_b32 s0, s4, s1 -; TONGA-NEXT: s_xor_b32 s1, s5, s2 -; TONGA-NEXT: s_sub_i32 s2, 0, s13 -; TONGA-NEXT: s_ashr_i32 s18, s0, 31 -; TONGA-NEXT: s_sub_i32 s0, 0, s14 -; TONGA-NEXT: s_ashr_i32 s19, s1, 31 -; TONGA-NEXT: s_sub_i32 s1, 0, s15 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; TONGA-NEXT: v_mul_lo_u32 v4, s2, v0 -; TONGA-NEXT: v_mul_lo_u32 v5, s0, v1 -; TONGA-NEXT: v_mul_lo_u32 v6, s1, v2 -; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 -; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4 -; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 -; TONGA-NEXT: v_mul_hi_u32 v6, v2, v6 -; TONGA-NEXT: s_sub_i32 s20, 0, s17 -; TONGA-NEXT: v_readfirstlane_b32 s7, v7 -; TONGA-NEXT: s_abs_i32 s3, s3 -; TONGA-NEXT: s_abs_i32 s4, s4 -; TONGA-NEXT: s_abs_i32 s5, s5 -; TONGA-NEXT: v_mul_lo_u32 v7, s20, v3 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; TONGA-NEXT: v_mul_hi_u32 v0, s3, v0 -; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1 -; TONGA-NEXT: v_mul_hi_u32 v2, s5, v2 -; TONGA-NEXT: v_mul_hi_u32 v7, v3, v7 -; TONGA-NEXT: v_mul_lo_u32 v4, v0, s13 -; TONGA-NEXT: v_mul_lo_u32 v6, v1, s14 -; TONGA-NEXT: v_mul_lo_u32 v8, v2, s15 -; TONGA-NEXT: s_abs_i32 s16, s7 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; TONGA-NEXT: v_mul_hi_u32 v3, s16, v3 -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s3, v4 -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, s4, v6 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, s5, v8 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v0 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v1 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v2 -; TONGA-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 -; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6 -; TONGA-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8 -; TONGA-NEXT: v_subrev_u32_e32 v10, vcc, s13, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] -; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, s14, v6 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] -; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, s15, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5] -; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v0 -; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3] -; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v1 -; TONGA-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5] -; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v2 -; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 -; TONGA-NEXT: v_mul_lo_u32 v4, v3, s17 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s14, v5 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 -; TONGA-NEXT: s_ashr_i32 s12, s12, 31 -; TONGA-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; TONGA-NEXT: v_xor_b32_e32 v0, s12, v0 -; TONGA-NEXT: v_xor_b32_e32 v1, s18, v1 -; TONGA-NEXT: v_xor_b32_e32 v2, s19, v2 -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s16, v4 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s12, v0 -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s18, v1 -; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, s19, v2 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 -; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, s17, v4 -; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 -; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 -; TONGA-NEXT: s_xor_b32 s0, s7, s6 -; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 -; TONGA-NEXT: s_ashr_i32 s0, s0, 31 -; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; TONGA-NEXT: v_xor_b32_e32 v3, s0, v3 -; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1] +; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 +; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1 +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; TONGA-NEXT: v_mul_lo_u32 v10, v10, v9 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11 +; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11 +; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc +; TONGA-NEXT: v_mul_hi_u32 v4, v9, v10 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3 +; TONGA-NEXT: v_max_i32_e32 v6, v3, v6 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 +; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4 +; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14 +; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5 +; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 +; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3 +; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32: @@ -2006,7 +1994,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mul_lo_u32 v1, v3, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 -; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v2, v1 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -2014,7 +2002,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -2053,7 +2041,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1 -; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v2, v1 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -2061,7 +2049,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index 5944342b2642a..bbd179364374c 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -467,28 +467,28 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-NEXT: s_abs_i32 s2, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_sub_i32 s6, 0, s2 -; GCN-NEXT: s_ashr_i32 s5, s3, 31 +; GCN-NEXT: s_ashr_i32 s5, s4, 31 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: s_abs_i32 s3, s3 -; GCN-NEXT: v_readfirstlane_b32 s4, v3 +; GCN-NEXT: s_abs_i32 s4, s4 +; GCN-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 ; GCN-NEXT: s_mul_i32 s6, s6, s7 ; GCN-NEXT: s_mul_hi_u32 s6, s7, s6 ; GCN-NEXT: s_add_i32 s7, s7, s6 -; GCN-NEXT: s_mul_hi_u32 s6, s3, s7 +; GCN-NEXT: s_mul_hi_u32 s6, s4, s7 ; GCN-NEXT: s_mul_i32 s6, s6, s2 -; GCN-NEXT: s_sub_i32 s3, s3, s6 -; GCN-NEXT: s_sub_i32 s6, s3, s2 -; GCN-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-NEXT: s_cselect_b32 s3, s6, s3 -; GCN-NEXT: s_sub_i32 s6, s3, s2 -; GCN-NEXT: s_cmp_ge_u32 s3, s2 -; GCN-NEXT: s_cselect_b32 s2, s6, s3 -; GCN-NEXT: s_abs_i32 s3, s4 +; GCN-NEXT: s_sub_i32 s4, s4, s6 +; GCN-NEXT: s_sub_i32 s6, s4, s2 +; GCN-NEXT: s_cmp_ge_u32 s4, s2 +; GCN-NEXT: s_cselect_b32 s4, s6, s4 +; GCN-NEXT: s_sub_i32 s6, s4, s2 +; GCN-NEXT: s_cmp_ge_u32 s4, s2 +; GCN-NEXT: s_cselect_b32 s2, s6, s4 +; GCN-NEXT: s_abs_i32 s3, s3 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_xor_b32 s2, s2, s5 ; GCN-NEXT: s_sub_i32 s7, 0, s3