From d4ac937b79a628dd2d962439d9b0af9b205319ac Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Wed, 2 Jul 2025 04:39:04 -0500 Subject: [PATCH 01/29] Add new test for source modifiers on select --- .../AMDGPU/integer-select-source-modifiers.ll | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll new file mode 100644 index 0000000000000..6e7ff16b74139 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s + +define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) { + %neg.a = xor i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %b + ret i32 %select +} + +define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { + %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b + ret <2 x i32> %select +} + +define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) { + %neg.a = and i32 %a, u0x7fffffff + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %b + ret i32 %select +} + +define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { + %neg.a = and <2 x i32> %a, splat (i32 u0x7fffffff) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b + ret <2 x i32> %select +} + +define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) { + %neg.a = or i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %b + ret i32 %select +} + +define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { + %neg.a = or <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b + ret <2 x i32> %select +} + +define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) { + %neg.a = xor i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) { + %neg.a = and i64 %a, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) { + %neg.a = or i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} From c89274e65b68073b59089df7e861e7fb4a7979e7 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Wed, 2 Jul 2025 04:40:36 -0500 Subject: [PATCH 02/29] Populate check-lines before patching --- 
.../AMDGPU/integer-select-source-modifiers.ll | 170 ++++++++++++++++++ 1 file changed, 170 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll index 6e7ff16b74139..dd6cf9bc6c592 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll @@ -5,6 +5,22 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_select_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = xor i32 %a, u0x80000000 %cmp = icmp eq i32 %cond, zeroinitializer %select = select i1 %cmp, i32 %neg.a, i32 %b @@ -12,6 +28,28 @@ define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) { } define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_select_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) %cmp = icmp eq <2 x i32> %cond, zeroinitializer %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b @@ -19,6 +57,22 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) } define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fabs_select_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = and i32 %a, u0x7fffffff %cmp = icmp eq i32 %cond, zeroinitializer %select = select i1 %cmp, i32 %neg.a, i32 %b @@ -26,6 +80,28 @@ define i32 @fabs_select_i32(i32 %cond, i32 %a, 
i32 %b) { } define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fabs_select_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = and <2 x i32> %a, splat (i32 u0x7fffffff) %cmp = icmp eq <2 x i32> %cond, zeroinitializer %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b @@ -33,6 +109,22 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) } define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_fabs_select_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = or i32 %a, u0x80000000 %cmp = icmp eq i32 %cond, zeroinitializer %select = select i1 %cmp, i32 %neg.a, i32 %b @@ -40,6 +132,28 @@ define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) { } define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_fabs_select_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, 0x80000000, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_or_b32_e32 v2, 0x80000000, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_or_b32_e32 v3, 0x80000000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = or <2 x i32> %a, splat (i32 u0x80000000) %cmp = icmp eq <2 x i32> %cond, zeroinitializer %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b @@ -47,6 +161,23 @@ define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32 } define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_select_i64: 
+; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = xor i64 %a, u0x8000000000000000 %cmp = icmp eq i64 %cond, zeroinitializer %select = select i1 %cmp, i64 %neg.a, i64 %b @@ -54,6 +185,23 @@ define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) { } define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fabs_select_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = and i64 %a, u0x7fffffffffffffff %cmp = icmp eq i64 %cond, zeroinitializer %select = select i1 %cmp, i64 %neg.a, i64 %b @@ -61,8 +209,30 @@ define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) { } define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_fabs_select_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = or i64 %a, u0x8000000000000000 %cmp = icmp eq i64 %cond, zeroinitializer %select = select i1 %cmp, i64 %neg.a, i64 %b ret i64 %select } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} +; GFX7: {{.*}} +; GFX9: {{.*}} From b6b37265d283ee0d6a301280e8c34a68283375ac Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 7 Jul 2025 10:12:45 -0500 Subject: [PATCH 03/29] [AMDGPU][SDAG] Support source modifiers as integer on select Extend the DAGCombine() for select to directly support fneg and fabs for i32, v2i32 and i64. 
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 77 ++++++++++++++++++- .../AMDGPU/integer-select-source-modifiers.ll | 40 +++------- 2 files changed, 86 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e64d2162441ab..4a719d8e145f4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4842,6 +4842,64 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } +static EVT IntToFloatVT(EVT VT) { + return VT = VT.isVector() ? MVT::getVectorVT(MVT::getFloatingPointVT( + VT.getScalarSizeInBits()), + VT.getVectorNumElements()) + : MVT::getFloatingPointVT(VT.getFixedSizeInBits()); +} + +static SDValue BitwiseToSrcModifierOp(SDValue N, + TargetLowering::DAGCombinerInfo &DCI) { + + unsigned Opc = N.getNode()->getOpcode(); + if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::AND) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N.getNode()->getOperand(0); + SDValue RHS = N.getNode()->getOperand(1); + ConstantSDNode *CRHS = isConstOrConstSplat(RHS); + + if (!CRHS) + return SDValue(); + + EVT VT = RHS.getValueType(); + + assert((VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) && + "Expected i32, v2i32 or i64 value type."); + + uint64_t Mask = 0; + if (VT.isVector()) { + SDValue Splat = DAG.getSplatValue(RHS); + const ConstantSDNode *C = dyn_cast(Splat); + Mask = C->getZExtValue(); + } else + Mask = CRHS->getZExtValue(); + + EVT FVT = IntToFloatVT(VT); + SDValue BC = DAG.getNode(ISD::BITCAST, SDLoc(N), FVT, LHS); + + switch (Opc) { + case ISD::XOR: + if (Mask == 0x80000000u || Mask == 0x8000000000000000u) + return DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC); + return SDValue(); + case ISD::OR: + if (Mask == 0x80000000u || Mask == 0x8000000000000000u) { + SDValue Abs = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC); + return DAG.getNode(ISD::FABS, SDLoc(N), FVT, Abs); + } + return SDValue(); + case ISD::AND: + if (Mask == 0x7fffffffu || Mask == 0x7fffffffffffffffu) + return DAG.getNode(ISD::FABS, SDLoc(N), FVT, BC); + return SDValue(); + default: + return SDValue(); + } +} + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) @@ -4876,12 +4934,25 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, } if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) { - SDValue MinMax - = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + SDValue MinMax = + combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); // Revisit this node so we can catch min3/max3/med3 patterns. - //DCI.AddToWorklist(MinMax.getNode()); + // DCI.AddToWorklist(MinMax.getNode()); return MinMax; } + + // Support source modifiers as integer. + if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) { + SDLoc SL(N); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + if (SDValue SrcMod = BitwiseToSrcModifierOp(LHS, DCI)) { + SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, VT, RHS); + SDValue FSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, SrcMod, FRHS); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, FSelect); + return BC; + } + } } // There's no reason to not do this if the condition has other uses. 
diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll index dd6cf9bc6c592..2db20e672c303 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll @@ -8,18 +8,15 @@ define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) { ; GCN-LABEL: fneg_select_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = xor i32 %a, u0x80000000 %cmp = icmp eq i32 %cond, zeroinitializer @@ -31,24 +28,19 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) ; GCN-LABEL: fneg_select_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) %cmp = icmp eq <2 x i32> %cond, zeroinitializer @@ -60,18 +52,15 @@ define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) { ; GCN-LABEL: fabs_select_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = and i32 %a, u0x7fffffff %cmp = icmp eq i32 %cond, zeroinitializer @@ -83,24 +72,19 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) ; GCN-LABEL: fabs_select_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_and_b32_e32 v2, 0x7fffffff, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v4, |v2|, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, |v2|, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = and <2 x i32> %a, splat (i32 u0x7fffffff) %cmp = icmp eq <2 x i32> %cond, zeroinitializer From dea39d166b9b50e053d3e6ceeccc7fcdb304b13a Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 7 Jul 2025 10:32:27 -0500 Subject: [PATCH 04/29] Simplify switch in BitwiseToSrcModifierOp() --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 4a719d8e145f4..e9e4c2c35f1e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4884,20 +4884,21 @@ static SDValue BitwiseToSrcModifierOp(SDValue N, case ISD::XOR: if (Mask == 0x80000000u || Mask == 0x8000000000000000u) return DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC); - return SDValue(); + break; case ISD::OR: if (Mask == 0x80000000u || Mask == 0x8000000000000000u) { SDValue Abs = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC); return DAG.getNode(ISD::FABS, SDLoc(N), FVT, Abs); } - return SDValue(); + break; case ISD::AND: if (Mask == 0x7fffffffu || Mask == 0x7fffffffffffffffu) return DAG.getNode(ISD::FABS, SDLoc(N), FVT, BC); - return SDValue(); + break; default: return SDValue(); } + return SDValue(); } SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, From 004dc9fb25514bcd37756148b4ceeb04133f9e3e Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 7 Jul 2025 10:46:53 -0500 Subject: [PATCH 05/29] [NFC] Correct typo in BitwiseToSrcModifierOp() --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e9e4c2c35f1e4..f13a1e12f1a7d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4887,8 +4887,8 @@ static SDValue BitwiseToSrcModifierOp(SDValue N, break; case ISD::OR: if (Mask == 0x80000000u || Mask == 0x8000000000000000u) { - SDValue Abs = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC); - return DAG.getNode(ISD::FABS, SDLoc(N), FVT, Abs); + SDValue Neg = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC); + return DAG.getNode(ISD::FABS, SDLoc(N), FVT, Neg); } break; case ISD::AND: From b27ce624f2e3d9f47a6fdd23fb75e34e8c21e10b Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 7 Jul 2025 11:11:31 -0500 Subject: [PATCH 06/29] Fix bitcast type in performSelectCombine() --- 
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index f13a1e12f1a7d..2c89fe9e12a7e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4943,13 +4943,15 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, } // Support source modifiers as integer. + // (select c, (xor/or/and x, c), y) -> (bitcast (select c))) if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) { SDLoc SL(N); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); if (SDValue SrcMod = BitwiseToSrcModifierOp(LHS, DCI)) { - SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, VT, RHS); - SDValue FSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, SrcMod, FRHS); + EVT FVT = IntToFloatVT(VT); + SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, FVT, RHS); + SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, SrcMod, FRHS); SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, FSelect); return BC; } From f503034123bbaa95523b6c607ce35f53d440482f Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 7 Jul 2025 11:31:26 -0500 Subject: [PATCH 07/29] Respond to first review comments --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 30 +++++++------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 2c89fe9e12a7e..86eb6cf622fa3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4857,8 +4857,8 @@ static SDValue BitwiseToSrcModifierOp(SDValue N, return SDValue(); SelectionDAG &DAG = DCI.DAG; - SDValue LHS = N.getNode()->getOperand(0); - SDValue RHS = N.getNode()->getOperand(1); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); ConstantSDNode *CRHS = isConstOrConstSplat(RHS); if (!CRHS) @@ -4869,31 +4869,25 @@ static SDValue BitwiseToSrcModifierOp(SDValue N, assert((VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) && "Expected i32, v2i32 or i64 value type."); - uint64_t Mask = 0; - if (VT.isVector()) { - SDValue Splat = DAG.getSplatValue(RHS); - const ConstantSDNode *C = dyn_cast(Splat); - Mask = C->getZExtValue(); - } else - Mask = CRHS->getZExtValue(); - + uint64_t Mask = CRHS->getZExtValue(); EVT FVT = IntToFloatVT(VT); - SDValue BC = DAG.getNode(ISD::BITCAST, SDLoc(N), FVT, LHS); + SDLoc SL = SDLoc(N); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, FVT, LHS); switch (Opc) { case ISD::XOR: if (Mask == 0x80000000u || Mask == 0x8000000000000000u) - return DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC); + return DAG.getNode(ISD::FNEG, SL, FVT, BC); break; case ISD::OR: if (Mask == 0x80000000u || Mask == 0x8000000000000000u) { SDValue Neg = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC); - return DAG.getNode(ISD::FABS, SDLoc(N), FVT, Neg); + return DAG.getNode(ISD::FABS, SL, FVT, Neg); } break; case ISD::AND: if (Mask == 0x7fffffffu || Mask == 0x7fffffffffffffffu) - return DAG.getNode(ISD::FABS, SDLoc(N), FVT, BC); + return DAG.getNode(ISD::FABS, SL, FVT, BC); break; default: return SDValue(); @@ -4945,12 +4939,10 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, // Support source modifiers as integer. 
// (select c, (xor/or/and x, c), y) -> (bitcast (select c))) if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) { - SDLoc SL(N); - SDValue LHS = N->getOperand(1); - SDValue RHS = N->getOperand(2); - if (SDValue SrcMod = BitwiseToSrcModifierOp(LHS, DCI)) { + if (SDValue SrcMod = BitwiseToSrcModifierOp(True, DCI)) { + SDLoc SL(N); EVT FVT = IntToFloatVT(VT); - SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, FVT, RHS); + SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, FVT, False); SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, SrcMod, FRHS); SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, FSelect); return BC; From e073552b23bd1c4af4e740254a777335613a9603 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 7 Jul 2025 11:38:29 -0500 Subject: [PATCH 08/29] Respond to secon review comments - rename function and correct test --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 12 +- .../AMDGPU/integer-select-source-modifiers.ll | 168 +++++++++++++++++- 2 files changed, 165 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 86eb6cf622fa3..bd7979f5f17e8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4842,15 +4842,15 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } -static EVT IntToFloatVT(EVT VT) { +static EVT getFloatVT(EVT VT) { return VT = VT.isVector() ? MVT::getVectorVT(MVT::getFloatingPointVT( VT.getScalarSizeInBits()), VT.getVectorNumElements()) : MVT::getFloatingPointVT(VT.getFixedSizeInBits()); } -static SDValue BitwiseToSrcModifierOp(SDValue N, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue getBitwiseToSrcModifierOp(SDValue N, + TargetLowering::DAGCombinerInfo &DCI) { unsigned Opc = N.getNode()->getOpcode(); if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::AND) @@ -4870,7 +4870,7 @@ static SDValue BitwiseToSrcModifierOp(SDValue N, "Expected i32, v2i32 or i64 value type."); uint64_t Mask = CRHS->getZExtValue(); - EVT FVT = IntToFloatVT(VT); + EVT FVT = getFloatVT(VT); SDLoc SL = SDLoc(N); SDValue BC = DAG.getNode(ISD::BITCAST, SL, FVT, LHS); @@ -4939,9 +4939,9 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, // Support source modifiers as integer. 
// (select c, (xor/or/and x, c), y) -> (bitcast (select c))) if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) { - if (SDValue SrcMod = BitwiseToSrcModifierOp(True, DCI)) { + if (SDValue SrcMod = getBitwiseToSrcModifierOp(True, DCI)) { SDLoc SL(N); - EVT FVT = IntToFloatVT(VT); + EVT FVT = getFloatVT(VT); SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, FVT, False); SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, SrcMod, FRHS); SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, FSelect); diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll index 2db20e672c303..8e1905475f628 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) { ; GCN-LABEL: fneg_select_i32: @@ -12,6 +11,20 @@ define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) { ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX7-LABEL: fneg_select_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: fneg_select_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -34,6 +47,24 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) ; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX7-LABEL: fneg_select_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: fneg_select_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -56,6 +87,20 @@ define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) { ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc ; GCN-NEXT: 
s_setpc_b64 s[30:31] ; +; GFX7-LABEL: fabs_select_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fabs_select_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: fabs_select_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -78,6 +123,24 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) ; GCN-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX7-LABEL: fabs_select_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, |v2|, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fabs_select_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, |v2|, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: fabs_select_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -101,6 +164,22 @@ define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) { ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX7-LABEL: fneg_fabs_select_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v1, 0x80000000, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_fabs_select_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, 0x80000000, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: fneg_fabs_select_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -127,6 +206,28 @@ define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32 ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX7-LABEL: fneg_fabs_select_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v2, 0x80000000, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_or_b32_e32 v3, 0x80000000, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_fabs_select_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v2, 0x80000000, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_or_b32_e32 v3, 0x80000000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: fneg_fabs_select_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -154,6 +255,24 @@ define i64 
@fneg_select_i64(i64 %cond, i64 %a, i64 %b) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX7-LABEL: fneg_select_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: fneg_select_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -178,6 +297,24 @@ define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX7-LABEL: fabs_select_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fabs_select_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: fabs_select_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -202,6 +339,24 @@ define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) { ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX7-LABEL: fneg_fabs_select_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_or_b32_e32 v3, 0x80000000, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_fabs_select_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_or_b32_e32 v3, 0x80000000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: fneg_fabs_select_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -215,8 +370,3 @@ define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) { %select = select i1 %cmp, i64 %neg.a, i64 %b ret i64 %select } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX11-FAKE16: {{.*}} -; GFX11-TRUE16: {{.*}} -; GFX7: {{.*}} -; GFX9: {{.*}} From 000ddc8efbedfe9cec22bbcfb6f57538f47d71e6 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 7 Jul 2025 11:57:18 -0500 Subject: [PATCH 09/29] [NFC] Remove incomplete dag-style comment --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index bd7979f5f17e8..b159746109dbf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4937,7 +4937,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, } // Support source modifiers as integer. - // (select c, (xor/or/and x, c), y) -> (bitcast (select c))) if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) { if (SDValue SrcMod = getBitwiseToSrcModifierOp(True, DCI)) { SDLoc SL(N); From 2604329f86229792e85fa456c288629c56328b76 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 7 Jul 2025 12:58:02 -0500 Subject: [PATCH 10/29] Make test for bitwise src mods more stringent and correct fneg-fabs order --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index b159746109dbf..ecb535de7e33f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4876,17 +4876,20 @@ static SDValue getBitwiseToSrcModifierOp(SDValue N, switch (Opc) { case ISD::XOR: - if (Mask == 0x80000000u || Mask == 0x8000000000000000u) + if ((Mask == 0x80000000u && VT.getFixedSizeInBits() == 32) || + (Mask == 0x8000000000000000u && VT.getFixedSizeInBits() == 64)) return DAG.getNode(ISD::FNEG, SL, FVT, BC); break; case ISD::OR: - if (Mask == 0x80000000u || Mask == 0x8000000000000000u) { - SDValue Neg = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC); - return DAG.getNode(ISD::FABS, SL, FVT, Neg); + if ((Mask == 0x80000000u && VT.getFixedSizeInBits() == 32) || + (Mask == 0x8000000000000000u && VT.getFixedSizeInBits() == 64)) { + SDValue Abs = DAG.getNode(ISD::ABS, SDLoc(N), FVT, BC); + return DAG.getNode(ISD::FNEG, SL, FVT, Abs); } break; case ISD::AND: - if (Mask == 0x7fffffffu || Mask == 0x7fffffffffffffffu) + if ((Mask == 0x7fffffffu && VT.getFixedSizeInBits() == 32) || + (Mask == 0x7fffffffffffffffu && VT.getFixedSizeInBits() == 64)) return DAG.getNode(ISD::FABS, SL, FVT, BC); break; default: From 97d93d620759cee2e710594c0b35b94e2eb37750 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 8 Jul 2025 06:33:32 -0500 Subject: [PATCH 11/29] Reviewer corrections --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index ecb535de7e33f..52df6171c37a0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4843,10 +4843,10 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, } static EVT getFloatVT(EVT VT) { - return VT = VT.isVector() ? MVT::getVectorVT(MVT::getFloatingPointVT( - VT.getScalarSizeInBits()), - VT.getVectorNumElements()) - : MVT::getFloatingPointVT(VT.getFixedSizeInBits()); + return VT.isVector() ? 
MVT::getVectorVT( + MVT::getFloatingPointVT(VT.getScalarSizeInBits()), + VT.getVectorNumElements()) + : MVT::getFloatingPointVT(VT.getFixedSizeInBits()); } static SDValue getBitwiseToSrcModifierOp(SDValue N, @@ -4883,7 +4883,7 @@ static SDValue getBitwiseToSrcModifierOp(SDValue N, case ISD::OR: if ((Mask == 0x80000000u && VT.getFixedSizeInBits() == 32) || (Mask == 0x8000000000000000u && VT.getFixedSizeInBits() == 64)) { - SDValue Abs = DAG.getNode(ISD::ABS, SDLoc(N), FVT, BC); + SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT, BC); return DAG.getNode(ISD::FNEG, SL, FVT, Abs); } break; From f255ddcaf7f8724c31f0ac177b2cb0b1b3b685a2 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 8 Jul 2025 10:08:50 -0500 Subject: [PATCH 12/29] Refactor to support the source modifiers on either or both operands. Also extend the test. Still struggling with 64-bit though as the legalizer is splitting some 64-bit ops into v2i32. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 39 +- .../AMDGPU/integer-select-source-modifiers.ll | 589 ++++++++++++++---- 2 files changed, 471 insertions(+), 157 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 52df6171c37a0..3f88c949fe96a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4843,17 +4843,15 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, } static EVT getFloatVT(EVT VT) { - return VT.isVector() ? MVT::getVectorVT( - MVT::getFloatingPointVT(VT.getScalarSizeInBits()), - VT.getVectorNumElements()) - : MVT::getFloatingPointVT(VT.getFixedSizeInBits()); + EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits()); + return VT.isVector() ? VT.changeVectorElementType(FT) : FT; } static SDValue getBitwiseToSrcModifierOp(SDValue N, TargetLowering::DAGCombinerInfo &DCI) { unsigned Opc = N.getNode()->getOpcode(); - if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::AND) + if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -4865,31 +4863,23 @@ static SDValue getBitwiseToSrcModifierOp(SDValue N, return SDValue(); EVT VT = RHS.getValueType(); - - assert((VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) && - "Expected i32, v2i32 or i64 value type."); - - uint64_t Mask = CRHS->getZExtValue(); EVT FVT = getFloatVT(VT); SDLoc SL = SDLoc(N); SDValue BC = DAG.getNode(ISD::BITCAST, SL, FVT, LHS); switch (Opc) { case ISD::XOR: - if ((Mask == 0x80000000u && VT.getFixedSizeInBits() == 32) || - (Mask == 0x8000000000000000u && VT.getFixedSizeInBits() == 64)) + if (CRHS->getAPIntValue().isSignMask()) return DAG.getNode(ISD::FNEG, SL, FVT, BC); break; case ISD::OR: - if ((Mask == 0x80000000u && VT.getFixedSizeInBits() == 32) || - (Mask == 0x8000000000000000u && VT.getFixedSizeInBits() == 64)) { + if (CRHS->getAPIntValue().isSignMask()) { SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT, BC); return DAG.getNode(ISD::FNEG, SL, FVT, Abs); } break; case ISD::AND: - if ((Mask == 0x7fffffffu && VT.getFixedSizeInBits() == 32) || - (Mask == 0x7fffffffffffffffu && VT.getFixedSizeInBits() == 64)) + if (CRHS->getAPIntValue().isMaxSignedValue()) return DAG.getNode(ISD::FABS, SL, FVT, BC); break; default: @@ -4939,15 +4929,20 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, return MinMax; } - // Support source modifiers as integer. + // Support source modifiers on integer types. 
if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) { - if (SDValue SrcMod = getBitwiseToSrcModifierOp(True, DCI)) { + SDValue SrcModTrue = getBitwiseToSrcModifierOp(True, DCI); + SDValue SrcModFalse = getBitwiseToSrcModifierOp(False, DCI); + if (SrcModTrue || SrcModFalse) { SDLoc SL(N); EVT FVT = getFloatVT(VT); - SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, FVT, False); - SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, SrcMod, FRHS); - SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, FSelect); - return BC; + SDValue FLHS = + SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, True); + SDValue FRHS = SrcModFalse ? SrcModFalse + : DAG.getNode(ISD::BITCAST, SL, FVT, False); + ; + SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS); + return DAG.getNode(ISD::BITCAST, SL, VT, FSelect); } } } diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll index 8e1905475f628..4fc31493a05f9 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll @@ -1,31 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s -define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) { -; GCN-LABEL: fneg_select_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: fneg_select_i32: +define i32 @fneg_select_i32_1(i32 %cond, i32 %a, i32 %b) { +; GFX7-LABEL: fneg_select_i32_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: fneg_select_i32: +; GFX9-LABEL: fneg_select_i32_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fneg_select_i32: +; GFX11-LABEL: fneg_select_i32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -37,17 +30,91 @@ define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) { ret i32 %select } -define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { -; GCN-LABEL: fneg_select_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] +define i32 @fneg_select_i32_2(i32 %cond, i32 %a, i32 %b) { +; GFX7-LABEL: fneg_select_i32_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, -v1, v2, vcc +; 
GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: fneg_select_v2i32: +; GFX9-LABEL: fneg_select_i32_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, -v1, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v1, v2, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %b, i32 %neg.a + ret i32 %select +} + +define i32 @fneg_select_i32_both(i32 %cond, i32 %a, i32 %b) { +; GFX7-LABEL: fneg_select_i32_both: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i32_both: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i32_both: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %neg.b = xor i32 %b, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %neg.b + ret i32 %select +} + +define i32 @fneg_1_fabs_2_select_i32(i32 %cond, i32 %a, i32 %b) { +; GFX7-LABEL: fneg_1_fabs_2_select_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_1_fabs_2_select_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_1_fabs_2_select_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %abs.b = and i32 %a, u0x7fffffff + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %abs.b + ret i32 %select +} + +define <2 x i32> @fneg_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GFX7-LABEL: fneg_select_v2i32_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -56,7 +123,7 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) ; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: fneg_select_v2i32: +; GFX9-LABEL: fneg_select_v2i32_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -65,7 +132,7 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) ; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fneg_select_v2i32: +; GFX11-LABEL: fneg_select_v2i32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -79,29 +146,55 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) ret <2 x i32> %select } -define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) { -; GCN-LABEL: fabs_select_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: fabs_select_i32: +define <2 x i32> @fneg_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GFX7-LABEL: fneg_select_v2i32_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_v2i32_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_v2i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a + ret <2 x i32> %select +} + +define i32 @fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) { +; GFX7-LABEL: fabs_select_i32_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: fabs_select_i32: +; GFX9-LABEL: fabs_select_i32_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fabs_select_i32: +; GFX11-LABEL: fabs_select_i32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -113,17 +206,35 @@ define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) { ret i32 %select } -define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { -; GCN-LABEL: fabs_select_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v4, |v2|, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: fabs_select_v2i32: +define i32 @fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) { +; GFX7-LABEL: fabs_select_i32_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, |v1|, v2, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fabs_select_i32_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: 
v_cndmask_b32_e64 v0, |v1|, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, |v1|, v2, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i32 %a, u0x7fffffff + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %b, i32 %neg.a + ret i32 %select +} + +define <2 x i32> @fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GFX7-LABEL: fabs_select_v2i32_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -132,7 +243,7 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) ; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: fabs_select_v2i32: +; GFX9-LABEL: fabs_select_v2i32_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -141,7 +252,7 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) ; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fabs_select_v2i32: +; GFX11-LABEL: fabs_select_v2i32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -155,38 +266,93 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) ret <2 x i32> %select } -define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) { -; GCN-LABEL: fneg_fabs_select_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, 0x80000000, v1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: fneg_fabs_select_i32: +define <2 x i32> @fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GFX7-LABEL: fabs_select_v2i32_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, |v2|, v4, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fabs_select_v2i32_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, |v2|, v4, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_v2i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, |v2|, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and <2 x i32> %a, splat (i32 u0x7fffffff) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a + ret <2 x i32> %select +} + +define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GFX7-LABEL: fneg_select_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) + %abs.b = and <2 x i32> %a, splat (i32 u0x7fffffff) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %abs.b, <2 x i32> %neg.a + ret <2 x i32> %select +} + +define i32 @fneg_fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) { +; GFX7-LABEL: fneg_fabs_select_i32_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_or_b32_e32 v1, 0x80000000, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: fneg_fabs_select_i32: +; GFX9-LABEL: fneg_fabs_select_i32_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fneg_fabs_select_i32: +; GFX11-LABEL: fneg_fabs_select_i32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = or i32 %a, u0x80000000 %cmp = icmp eq i32 %cond, zeroinitializer @@ -194,50 +360,59 @@ define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) { ret i32 %select } -define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { -; GCN-LABEL: fneg_fabs_select_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, 0x80000000, v2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: fneg_fabs_select_v2i32: +define i32 @fneg_fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) { +; GFX7-LABEL: fneg_fabs_select_i32_2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_or_b32_e32 v2, 0x80000000, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_or_b32_e32 v3, 0x80000000, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, -|v1|, v2, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_fabs_select_i32_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: 
v_cndmask_b32_e64 v0, -|v1|, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -|v1|, v2, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %b, i32 %neg.a + ret i32 %select +} + +define <2 x i32> @fneg_fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GFX7-LABEL: fneg_fabs_select_v2i32_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: fneg_fabs_select_v2i32: +; GFX9-LABEL: fneg_fabs_select_v2i32_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v2, 0x80000000, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_or_b32_e32 v3, 0x80000000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fneg_fabs_select_v2i32: +; GFX11-LABEL: fneg_fabs_select_v2i32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v2, 0x80000000, v2 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_or_b32_e32 v3, 0x80000000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = or <2 x i32> %a, splat (i32 u0x80000000) %cmp = icmp eq <2 x i32> %cond, zeroinitializer @@ -245,17 +420,41 @@ define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32 ret <2 x i32> %select } -define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) { -; GCN-LABEL: fneg_select_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: fneg_select_i64: +define <2 x i32> @fneg_fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GFX7-LABEL: fneg_fabs_select_v2i32_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_fabs_select_v2i32_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc +; 
GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_v2i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a + ret <2 x i32> %select +} + +define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) { +; GFX7-LABEL: fneg_select_i64_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] @@ -264,7 +463,7 @@ define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) { ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: fneg_select_i64: +; GFX9-LABEL: fneg_select_i64_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] @@ -273,7 +472,7 @@ define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fneg_select_i64: +; GFX11-LABEL: fneg_select_i64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] @@ -287,17 +486,78 @@ define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) { ret i64 %select } -define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) { -; GCN-LABEL: fabs_select_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: fabs_select_i64: +define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) { +; GFX7-LABEL: fneg_select_i64_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i64_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) { +; GFX7-LABEL: fneg_1_fabs_2_select_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 
0x7fffffff, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_1_fabs_2_select_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_1_fabs_2_select_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %abs.b = and i64 %b, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %abs.b + ret i64 %select +} + +define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { +; GFX7-LABEL: fabs_select_i64_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] @@ -306,7 +566,7 @@ define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) { ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: fabs_select_i64: +; GFX9-LABEL: fabs_select_i64_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] @@ -315,7 +575,7 @@ define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fabs_select_i64: +; GFX11-LABEL: fabs_select_i64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] @@ -329,17 +589,41 @@ define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) { ret i64 %select } -define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) { -; GCN-LABEL: fneg_fabs_select_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: fneg_fabs_select_i64: +define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { +; GFX7-LABEL: fabs_select_i64_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fabs_select_i64_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 
0x7fffffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i64 %a, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { +; GFX7-LABEL: fneg_fabs_select_i64_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] @@ -348,7 +632,7 @@ define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) { ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: fneg_fabs_select_i64: +; GFX9-LABEL: fneg_fabs_select_i64_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] @@ -357,7 +641,7 @@ define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fneg_fabs_select_i64: +; GFX11-LABEL: fneg_fabs_select_i64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] @@ -370,3 +654,38 @@ define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) { %select = select i1 %cmp, i64 %neg.a, i64 %b ret i64 %select } + +define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { +; GFX7-LABEL: fneg_fabs_select_i64_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_or_b32_e32 v3, 0x80000000, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_fabs_select_i64_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_or_b32_e32 v3, 0x80000000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} From 2e2249aa617db150369cabec03ad467841a868b6 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 8 Jul 2025 10:17:38 -0500 Subject: [PATCH 13/29] Fix Typo. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3f88c949fe96a..7916f8203c390 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4940,7 +4940,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, True); SDValue FRHS = SrcModFalse ? 
SrcModFalse : DAG.getNode(ISD::BITCAST, SL, FVT, False); - ; SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS); return DAG.getNode(ISD::BITCAST, SL, VT, FSelect); } From a505a7204fdf9754228b36c454c9ed16ad5ed1e1 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 8 Jul 2025 11:17:25 -0500 Subject: [PATCH 14/29] Respond to reviewer - Add i16 tests, simplify obtaining type --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 +- .../AMDGPU/integer-select-source-modifiers.ll | 685 +++++++++--------- 2 files changed, 338 insertions(+), 350 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 7916f8203c390..e6b611eda3a10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4935,7 +4935,8 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, SDValue SrcModFalse = getBitwiseToSrcModifierOp(False, DCI); if (SrcModTrue || SrcModFalse) { SDLoc SL(N); - EVT FVT = getFloatVT(VT); + EVT FVT = + SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType(); SDValue FLHS = SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, True); SDValue FRHS = SrcModFalse ? SrcModFalse diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll index 4fc31493a05f9..eed83dd905c38 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll @@ -1,22 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define i32 @fneg_select_i32_1(i32 %cond, i32 %a, i32 %b) { -; GFX7-LABEL: fneg_select_i32_1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_select_i32_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_select_i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_i32_1: ; GFX11: ; %bb.0: @@ -31,19 +25,12 @@ define i32 @fneg_select_i32_1(i32 %cond, i32 %a, i32 %b) { } define i32 @fneg_select_i32_2(i32 %cond, i32 %a, i32 %b) { -; GFX7-LABEL: fneg_select_i32_2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, 
-v1, v2, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_select_i32_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, -v1, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_select_i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v1, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_i32_2: ; GFX11: ; %bb.0: @@ -58,19 +45,12 @@ define i32 @fneg_select_i32_2(i32 %cond, i32 %a, i32 %b) { } define i32 @fneg_select_i32_both(i32 %cond, i32 %a, i32 %b) { -; GFX7-LABEL: fneg_select_i32_both: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_select_i32_both: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_select_i32_both: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_i32_both: ; GFX11: ; %bb.0: @@ -86,19 +66,12 @@ define i32 @fneg_select_i32_both(i32 %cond, i32 %a, i32 %b) { } define i32 @fneg_1_fabs_2_select_i32(i32 %cond, i32 %a, i32 %b) { -; GFX7-LABEL: fneg_1_fabs_2_select_i32: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_1_fabs_2_select_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_1_fabs_2_select_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_1_fabs_2_select_i32: ; GFX11: ; %bb.0: @@ -114,23 +87,14 @@ define i32 @fneg_1_fabs_2_select_i32(i32 %cond, i32 %a, i32 %b) { } define <2 x i32> @fneg_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { -; GFX7-LABEL: fneg_select_v2i32_1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_select_v2i32_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_select_v2i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX11-LABEL: fneg_select_v2i32_1: ; GFX11: ; %bb.0: @@ -147,23 +111,14 @@ define <2 x i32> @fneg_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> % } define <2 x i32> @fneg_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { -; GFX7-LABEL: fneg_select_v2i32_2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_select_v2i32_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_select_v2i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_v2i32_2: ; GFX11: ; %bb.0: @@ -180,19 +135,12 @@ define <2 x i32> @fneg_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> % } define i32 @fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) { -; GFX7-LABEL: fabs_select_i32_1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fabs_select_i32_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fabs_select_i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_i32_1: ; GFX11: ; %bb.0: @@ -207,19 +155,12 @@ define i32 @fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) { } define i32 @fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) { -; GFX7-LABEL: fabs_select_i32_2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, |v1|, v2, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fabs_select_i32_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, |v1|, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fabs_select_i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, |v1|, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_i32_2: ; GFX11: ; %bb.0: @@ -234,23 +175,14 @@ define i32 @fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) { } define <2 x i32> @fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { -; GFX7-LABEL: fabs_select_v2i32_1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, |v2|, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc -; GFX7-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fabs_select_v2i32_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, |v2|, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fabs_select_v2i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v4, |v2|, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_v2i32_1: ; GFX11: ; %bb.0: @@ -267,23 +199,14 @@ define <2 x i32> @fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> % } define <2 x i32> @fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { -; GFX7-LABEL: fabs_select_v2i32_2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, |v2|, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fabs_select_v2i32_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, |v2|, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fabs_select_v2i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, |v2|, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_v2i32_2: ; GFX11: ; %bb.0: @@ -300,23 +223,14 @@ define <2 x i32> @fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> % } define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { -; GFX7-LABEL: fneg_select_v2i32: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_select_v2i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_select_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_v2i32: ; GFX11: ; %bb.0: @@ -334,19 +248,12 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) } define i32 @fneg_fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) { -; GFX7-LABEL: fneg_fabs_select_i32_1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc -; GFX7-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_fabs_select_i32_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_fabs_select_i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_fabs_select_i32_1: ; GFX11: ; %bb.0: @@ -361,19 +268,12 @@ define i32 @fneg_fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) { } define i32 @fneg_fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) { -; GFX7-LABEL: fneg_fabs_select_i32_2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, -|v1|, v2, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_fabs_select_i32_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, -|v1|, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_fabs_select_i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -|v1|, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_fabs_select_i32_2: ; GFX11: ; %bb.0: @@ -388,23 +288,14 @@ define i32 @fneg_fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) { } define <2 x i32> @fneg_fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { -; GFX7-LABEL: fneg_fabs_select_v2i32_1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_fabs_select_v2i32_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_fabs_select_v2i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_fabs_select_v2i32_1: ; GFX11: ; %bb.0: @@ -421,23 +312,14 @@ define <2 x i32> @fneg_fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i } define <2 x i32> @fneg_fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { -; GFX7-LABEL: fneg_fabs_select_v2i32_2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_fabs_select_v2i32_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc -; 
GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_fabs_select_v2i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_fabs_select_v2i32_2: ; GFX11: ; %bb.0: @@ -454,23 +336,14 @@ define <2 x i32> @fneg_fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i } define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) { -; GFX7-LABEL: fneg_select_i64_1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_select_i64_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_select_i64_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_i64_1: ; GFX11: ; %bb.0: @@ -487,23 +360,14 @@ define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) { } define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) { -; GFX7-LABEL: fneg_select_i64_2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_select_i64_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_select_i64_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_i64_2: ; GFX11: ; %bb.0: @@ -520,25 +384,15 @@ define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) { } define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) { -; GFX7-LABEL: fneg_1_fabs_2_select_i64: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_1_fabs_2_select_i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; 
GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_1_fabs_2_select_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_1_fabs_2_select_i64: ; GFX11: ; %bb.0: @@ -557,23 +411,14 @@ define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) { } define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { -; GFX7-LABEL: fabs_select_i64_1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fabs_select_i64_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fabs_select_i64_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_i64_1: ; GFX11: ; %bb.0: @@ -590,23 +435,14 @@ define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { } define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { -; GFX7-LABEL: fabs_select_i64_2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fabs_select_i64_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fabs_select_i64_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_i64_2: ; GFX11: ; %bb.0: @@ -623,23 +459,14 @@ define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { } define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { -; GFX7-LABEL: fneg_fabs_select_i64_1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_or_b32_e32 v3, 0x80000000, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_fabs_select_i64_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v3, 0x80000000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_fabs_select_i64_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_fabs_select_i64_1: ; GFX11: ; %bb.0: @@ -656,23 +483,14 @@ define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { } define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { -; GFX7-LABEL: fneg_fabs_select_i64_2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX7-NEXT: v_or_b32_e32 v3, 0x80000000, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fneg_fabs_select_i64_2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v3, 0x80000000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fneg_fabs_select_i64_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_fabs_select_i64_2: ; GFX11: ; %bb.0: @@ -687,5 +505,174 @@ define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { %select = select i1 %cmp, i64 %b, i64 %neg.a ret i64 %select } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GCN: {{.*}} +define i16 @fneg_select_i16_1(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_select_i16_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i16_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_select_i16_1: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_select_i16_1: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %neg.a, i16 %b + ret i16 %select +} + +define i16 @fneg_select_i16_2(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_select_i16_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i16_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_select_i16_2: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_select_i16_2: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %b, i16 %neg.a + ret i16 %select +} + +define i16 @fneg_select_i16_both(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_select_i16_both: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, 
vcc +; GFX7-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i16_both: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_select_i16_both: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_select_i16_both: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %neg.b = xor i16 %b, u0x8000 + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %neg.a, i16 %neg.b + ret i16 %select +} + +define i16 @fneg_1_fabs_2_select_i16(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_1_fabs_2_select_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_1_fabs_2_select_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_1_fabs_2_select_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v1.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_1_fabs_2_select_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %abs.b = and i16 %a, u0x7fff + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %neg.a, i16 %abs.b + ret i16 %select +} + + From a8bd72617bdf8c5616712b5d801f3b0c5a37fe53 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 8 Jul 2025 11:22:37 -0500 Subject: [PATCH 15/29] Inline bitcast node creation. 
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e6b611eda3a10..282c22930c709 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4865,22 +4865,24 @@ static SDValue getBitwiseToSrcModifierOp(SDValue N,
   EVT VT = RHS.getValueType();
   EVT FVT = getFloatVT(VT);
   SDLoc SL = SDLoc(N);
-  SDValue BC = DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
 
   switch (Opc) {
   case ISD::XOR:
     if (CRHS->getAPIntValue().isSignMask())
-      return DAG.getNode(ISD::FNEG, SL, FVT, BC);
+      return DAG.getNode(ISD::FNEG, SL, FVT,
+                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
     break;
   case ISD::OR:
     if (CRHS->getAPIntValue().isSignMask()) {
-      SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT, BC);
+      SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT,
+                                DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
       return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
     }
     break;
   case ISD::AND:
     if (CRHS->getAPIntValue().isMaxSignedValue())
-      return DAG.getNode(ISD::FABS, SL, FVT, BC);
+      return DAG.getNode(ISD::FABS, SL, FVT,
+                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
     break;
   default:
     return SDValue();

From e5f1e67ee8968af97d61c330aa641eeb662e2f8f Mon Sep 17 00:00:00 2001
From: Chris Jackson
Date: Fri, 11 Jul 2025 06:49:20 -0500
Subject: [PATCH 16/29] Add functional implementation for i64

While this is functional, it can be refactored and simplified; I am
working on this now.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 103 ++++++++++++++++--
 .../AMDGPU/integer-select-source-modifiers.ll |  70 +++++-------
 2 files changed, 122 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 282c22930c709..ef2e9c25f352e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4931,23 +4931,112 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
     return MinMax;
   }
 
-  // Support source modifiers on integer types.
-  if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) {
-    SDValue SrcModTrue = getBitwiseToSrcModifierOp(True, DCI);
-    SDValue SrcModFalse = getBitwiseToSrcModifierOp(False, DCI);
+  auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue {
+    SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS, DCI);
+    SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS, DCI);
     if (SrcModTrue || SrcModFalse) {
       SDLoc SL(N);
       EVT FVT =
           SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
       SDValue FLHS =
-          SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, True);
+          SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
       SDValue FRHS = SrcModFalse ? SrcModFalse
-                                 : DAG.getNode(ISD::BITCAST, SL, FVT, False);
+                                 : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
       SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS);
       return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
+    }
+    return SDValue();
+  };
+
+  // Support source modifiers on integer operands.
+  if (VT == MVT::i32 || VT == MVT::v2i32)
+    if (SDValue F = FoldSrcMods(True, False, VT))
+      return F;
+
+  // For i64, if a source modifier is to be folded in, we split into two i32
+  // selects of the high and low values. The operator need only be applied to
+  // the high values in order to change the sign bit.
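+  // As an illustrative sketch (hypothetical values, not taken from the
+  // tests):
+  //   select i1 %c, i64 (xor %x, u0x8000000000000000), i64 %y
+  // flips only bit 63, so the low 32-bit halves can be selected unchanged
+  // and the FNEG modifier is attached to the high-half select alone.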
+  if (VT == MVT::i64) {
+    bool TrueHasModifierOp =
+        (True.getOpcode() == ISD::AND || True.getOpcode() == ISD::OR ||
+         True.getOpcode() == ISD::XOR);
+
+    bool FalseHasModifierOp =
+        (False.getOpcode() == ISD::AND || False.getOpcode() == ISD::OR ||
+         False.getOpcode() == ISD::XOR);
+
+    ConstantSDNode *CTrueRHS = nullptr;
+    if (TrueHasModifierOp) {
+      SDValue TrueRHS = True->getOperand(1);
+      CTrueRHS = dyn_cast<ConstantSDNode>(TrueRHS);
+    }
+
+    ConstantSDNode *CFalseRHS = nullptr;
+    if (FalseHasModifierOp) {
+      SDValue FalseRHS = False->getOperand(1);
+      CFalseRHS = dyn_cast<ConstantSDNode>(FalseRHS);
+    }
+
+    // If True or False is a candidate for source modifier folding, extract
+    // the high value using APInt and reconstruct a ConstantSDNode.
+    SDValue TrueHiOp;
+    SDValue BCTrue = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, True);
+    SDValue TrueLo;
+    SDValue TrueHi;
+    if (CTrueRHS) {
+      SDValue TrueLHS = True->getOperand(0);
+      SDValue TrueLHSHiVal = getHiHalf64(BCTrue, DAG);
+      TrueLo = getLoHalf64(TrueLHS, DAG);
+      APInt CTrueRHSHiBits =
+          CTrueRHS->getAPIntValue().getHiBits(32).trunc(32);
+      SDValue CTrueRHSHiVal =
+          DAG.getConstant(CTrueRHSHiBits, SDLoc(N), MVT::i32);
+      unsigned OpcTrue = True.getOpcode();
+      TrueHiOp = DAG.getNode(OpcTrue, SDLoc(N), MVT::i32, TrueLHSHiVal,
+                             CTrueRHSHiVal);
+    } else {
+      TrueLo = getLoHalf64(BCTrue, DAG);
+      TrueHi = getHiHalf64(BCTrue, DAG);
+    }
+
+    SDValue FalseHiOp;
+    SDValue BCFalse = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, False);
+    SDValue FalseLo;
+    SDValue FalseHi;
+    if (CFalseRHS) {
+      SDValue FalseLHS = False->getOperand(0);
+      FalseLo = getLoHalf64(FalseLHS, DAG);
+      SDValue FalseLHSHiVal = getHiHalf64(BCFalse, DAG);
+      APInt CFalseRHSHiBits =
+          CFalseRHS->getAPIntValue().getHiBits(32).trunc(32);
+      SDValue CFalseRHSHiVal =
+          DAG.getConstant(CFalseRHSHiBits, SDLoc(N), MVT::i32);
+      unsigned OpcFalse = False.getOpcode();
+      FalseHiOp = DAG.getNode(OpcFalse, SDLoc(N), MVT::i32, FalseLHSHiVal,
+                              CFalseRHSHiVal);
+    } else {
+      FalseLo = getLoHalf64(BCFalse, DAG);
+      FalseHi = getHiHalf64(BCFalse, DAG);
+    }
+
+    if (CTrueRHS || CFalseRHS) {
+      // Place the low bits directly into the select. The operator is unneeded
+      // for these.
+      SDValue LoSelect =
+          DAG.getNode(ISD::SELECT, SDLoc(N), MVT::i32, Cond, TrueLo, FalseLo);
+      // If a source modifier may be folded use the bitwise-op of the high
+      // values, otherwise just pass the high part of the value.
+      SDValue FoldedHi =
+          FoldSrcMods(CTrueRHS ? TrueHiOp : TrueHi,
+                      CFalseRHS ? FalseHiOp : FalseHi, MVT::i32);
+
+      SDValue ResV =
+          DAG.getBuildVector(MVT::v2i32, SDLoc(N), {FoldedHi, LoSelect});
+      SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, ResV);
+      return Res;
     }
   }
-}
+}
 
 // There's no reason to not do this if the condition has other uses.
return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI); diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll index eed83dd905c38..c3ce0d1aa739e 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll @@ -340,18 +340,15 @@ define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_i64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v3 :: v_dual_cndmask_b32 v1, v4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = xor i64 %a, u0x8000000000000000 %cmp = icmp eq i64 %cond, zeroinitializer @@ -364,18 +361,15 @@ define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_i64_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v5 :: v_dual_cndmask_b32 v1, v2, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = xor i64 %a, u0x8000000000000000 %cmp = icmp eq i64 %cond, zeroinitializer @@ -388,20 +382,16 @@ define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, |v5|, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_1_fabs_2_select_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, |v5|, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = xor i64 %a, u0x8000000000000000 %abs.b = and i64 %b, u0x7fffffffffffffff @@ -415,18 +405,16 @@ define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 
%b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v5, |v3|, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_i64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, |v3|, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = and i64 %a, u0x7fffffffffffffff %cmp = icmp eq i64 %cond, zeroinitializer @@ -439,18 +427,16 @@ define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, |v3|, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_i64_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 0x7fffffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, |v3|, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = and i64 %a, u0x7fffffffffffffff %cmp = icmp eq i64 %cond, zeroinitializer @@ -463,18 +449,16 @@ define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v5, -|v3|, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_fabs_select_i64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, -|v3|, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = or i64 %a, u0x8000000000000000 %cmp = icmp eq i64 %cond, zeroinitializer @@ -487,18 +471,16 @@ define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_or_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, -|v3|, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_fabs_select_i64_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -|v3|, v5, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = or i64 %a, u0x8000000000000000
   %cmp = icmp eq i64 %cond, zeroinitializer

From 767cc114869c7ac835a7767517e63092275cf7ef Mon Sep 17 00:00:00 2001
From: Chris Jackson
Date: Fri, 11 Jul 2025 12:27:01 -0500
Subject: [PATCH 17/29] Fix formatting

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ef2e9c25f352e..a61f7ccc8b956 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4940,22 +4940,23 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
           SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
       SDValue FLHS =
           SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
-      SDValue FRHS = SrcModFalse ? SrcModFalse
-                                 : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
+      SDValue FRHS =
+          SrcModFalse ? SrcModFalse : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
       SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS);
       return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
-    }
-    return SDValue();
-  };
+    }
+    return SDValue();
+  };
 
   // Support source modifiers on integer operands.
   if (VT == MVT::i32 || VT == MVT::v2i32)
     if (SDValue F = FoldSrcMods(True, False, VT))
       return F;
 
-  // For i64, if a source modifier is to be folded in, we split into two i32
-  // selects of the high and low values. The operator need only be applied to
-  // the high values in order to change the sign bit.
+  // auto SplitSelect = [&]() -> std::pair<SDValue, SDValue>(
+  // For i64, if a source modifier is to be folded in, we split into two i32
+  // selects of the high and low values. The operator need only be applied to
+  // the high values in order to change the sign bit.
   if (VT == MVT::i64) {
     bool TrueHasModifierOp =
         (True.getOpcode() == ISD::AND || True.getOpcode() == ISD::OR ||
@@ -5036,7 +5037,7 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
       return Res;
     }
   }
-}
+  }
 
   // There's no reason to not do this if the condition has other uses.
   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);

From 686919f62711e0c6536d01c1e0c81bc457b28598 Mon Sep 17 00:00:00 2001
From: Chris Jackson
Date: Sat, 12 Jul 2025 18:41:54 -0500
Subject: [PATCH 18/29] [DAGCombine] Move the AMDGPU combine to Target
 Independent DAGCombine

- Allows removal of the i64-specific code - the TI combine splits to
  i32 ops.
- Update quite a few AMDGPU tests; these all appear to be improvements
  in codegen. Need to double-check.
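
A sketch of why the dedicated i64 path can go away (standalone check, not
LLVM code): after the type legalizer splits an i64 select into two i32
selects, a 64-bit sign-bit mask only affects the high 32-bit half, so the
generic i32 fold picks it up.

  #include <bit>
  #include <cassert>
  #include <cstdint>

  int main() {
    double x = 2.25;
    uint64_t bits = std::bit_cast<uint64_t>(x);
    uint64_t neg = bits ^ 0x8000000000000000ull; // fneg via the sign mask
    // The low half is untouched; only the high half changes.
    assert(static_cast<uint32_t>(neg) == static_cast<uint32_t>(bits));
    assert(static_cast<uint32_t>(neg >> 32) ==
           (static_cast<uint32_t>(bits >> 32) ^ 0x80000000u));
    assert(std::bit_cast<double>(neg) == -x);
    return 0;
  }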
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   75 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   89 -
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |    8 +
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |    4 +
 .../atomic_optimizations_global_pointer.ll    |   18 +-
 .../branch-folding-implicit-def-subreg.ll     |   18 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     | 2219 +++++++++--------
 llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll |  302 ++-
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |    7 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |   58 +-
 llvm/test/CodeGen/AMDGPU/fptrunc.ll           |  133 +-
 .../AMDGPU/fptrunc.v2f16.no.fast.math.ll      |   64 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |  462 ++--
 .../AMDGPU/integer-select-source-modifiers.ll |   54 +-
 llvm/test/CodeGen/AMDGPU/saddsat.ll           |   52 +-
 .../AMDGPU/sdwa-peephole-cndmask-sext.ll      |    7 +-
 llvm/test/CodeGen/AMDGPU/ssubsat.ll           |  378 ++-
 17 files changed, 2016 insertions(+), 1932 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 231184587d682..4f58ffa47fd20 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -684,7 +684,7 @@ namespace {
                               SDValue VecIn2, unsigned LeftIdx,
                               bool DidSplitVec);
     SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
-
+    SDValue getBitwiseToSrcModifierOp(SDValue N);
     /// Walk up chain skipping non-aliasing memory nodes,
     /// looking for aliasing nodes and adding them to the Aliases vector.
     void GatherAllAliases(SDNode *N, SDValue OriginalChain,
@@ -12175,6 +12175,56 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
   return SDValue();
 }
 
+static EVT getFloatVT(EVT VT) {
+  EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
+  return VT.isVector() ? VT.changeVectorElementType(FT) : FT;
+}
+
+SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) {
+
+  unsigned Opc = N.getNode()->getOpcode();
+  if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  if (!TLI.shouldFoldSelectWithIdentityConstant(Opc, N->getValueType(0),
+                                                ISD::SELECT, LHS, RHS))
+    return SDValue();
+
+  ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
+
+  if (!CRHS)
+    return SDValue();
+
+  EVT VT = RHS.getValueType();
+  EVT FVT = getFloatVT(VT);
+  SDLoc SL = SDLoc(N);
+
+  switch (Opc) {
+  case ISD::XOR:
+    if (CRHS->getAPIntValue().isSignMask())
+      return DAG.getNode(ISD::FNEG, SL, FVT,
+                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
+    break;
+  case ISD::OR:
+    if (CRHS->getAPIntValue().isSignMask()) {
+      SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT,
+                                DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
+      return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
+    }
+    break;
+  case ISD::AND:
+    if (CRHS->getAPIntValue().isMaxSignedValue())
+      return DAG.getNode(ISD::FABS, SL, FVT,
+                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
+    break;
+  default:
+    return SDValue();
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -12390,6 +12440,29 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG))
     return R;
 
+  auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue {
+    SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS);
+    SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS);
+    if (SrcModTrue || SrcModFalse) {
+      SDLoc SL(N);
+      EVT FVT =
+          SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
+      SDValue FLHS =
+          SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
+      SDValue FRHS =
+          SrcModFalse ? SrcModFalse : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
+      SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FLHS, FRHS);
+      return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
+    }
+    return SDValue();
+  };
+
+  // Identify bitmask operations that are source mods and create
+  // the relevant fneg, fabs or fneg+fabs.
+  if (VT == MVT::i32 || VT == MVT::v2i32)
+    if (SDValue F = FoldSrcMods(N1, N2, VT))
+      return F;
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a61f7ccc8b956..7436de2d6a6a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4948,95 +4948,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
     return SDValue();
   };
 
-  // Support source modifiers on integer operands.
-  if (VT == MVT::i32 || VT == MVT::v2i32)
-    if (SDValue F = FoldSrcMods(True, False, VT))
-      return F;
-
-  // auto SplitSelect = [&]() -> std::pair<SDValue, SDValue>(
-  // For i64, if a source modifier is to be folded in, we split into two i32
-  // selects of the high and low values. The operator need only be applied to
-  // the high values in order to change the sign bit.
-  if (VT == MVT::i64) {
-    bool TrueHasModifierOp =
-        (True.getOpcode() == ISD::AND || True.getOpcode() == ISD::OR ||
-         True.getOpcode() == ISD::XOR);
-
-    bool FalseHasModifierOp =
-        (False.getOpcode() == ISD::AND || False.getOpcode() == ISD::OR ||
-         False.getOpcode() == ISD::XOR);
-
-    ConstantSDNode *CTrueRHS = nullptr;
-    if (TrueHasModifierOp) {
-      SDValue TrueRHS = True->getOperand(1);
-      CTrueRHS = dyn_cast<ConstantSDNode>(TrueRHS);
-    }
-
-    ConstantSDNode *CFalseRHS = nullptr;
-    if (FalseHasModifierOp) {
-      SDValue FalseRHS = False->getOperand(1);
-      CFalseRHS = dyn_cast<ConstantSDNode>(FalseRHS);
-    }
-
-    // If True or False is a candidate for source modifier folding, extract
-    // the high value using APInt and reconstruct a ConstantSDNode.
-    SDValue TrueHiOp;
-    SDValue BCTrue = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, True);
-    SDValue TrueLo;
-    SDValue TrueHi;
-    if (CTrueRHS) {
-      SDValue TrueLHS = True->getOperand(0);
-      SDValue TrueLHSHiVal = getHiHalf64(BCTrue, DAG);
-      TrueLo = getLoHalf64(TrueLHS, DAG);
-      APInt CTrueRHSHiBits =
-          CTrueRHS->getAPIntValue().getHiBits(32).trunc(32);
-      SDValue CTrueRHSHiVal =
-          DAG.getConstant(CTrueRHSHiBits, SDLoc(N), MVT::i32);
-      unsigned OpcTrue = True.getOpcode();
-      TrueHiOp = DAG.getNode(OpcTrue, SDLoc(N), MVT::i32, TrueLHSHiVal,
-                             CTrueRHSHiVal);
-    } else {
-      TrueLo = getLoHalf64(BCTrue, DAG);
-      TrueHi = getHiHalf64(BCTrue, DAG);
-    }
-
-    SDValue FalseHiOp;
-    SDValue BCFalse = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, False);
-    SDValue FalseLo;
-    SDValue FalseHi;
-    if (CFalseRHS) {
-      SDValue FalseLHS = False->getOperand(0);
-      FalseLo = getLoHalf64(FalseLHS, DAG);
-      SDValue FalseLHSHiVal = getHiHalf64(BCFalse, DAG);
-      APInt CFalseRHSHiBits =
-          CFalseRHS->getAPIntValue().getHiBits(32).trunc(32);
-      SDValue CFalseRHSHiVal =
-          DAG.getConstant(CFalseRHSHiBits, SDLoc(N), MVT::i32);
-      unsigned OpcFalse = False.getOpcode();
-      FalseHiOp = DAG.getNode(OpcFalse, SDLoc(N), MVT::i32, FalseLHSHiVal,
-                              CFalseRHSHiVal);
-    } else {
-      FalseLo = getLoHalf64(BCFalse, DAG);
-      FalseHi = getHiHalf64(BCFalse, DAG);
-    }
-
-    if (CTrueRHS || CFalseRHS) {
-      // Place the low bits directly into the select. The operator is unneeded
-      // for these.
- SDValue LoSelect = - DAG.getNode(ISD::SELECT, SDLoc(N), MVT::i32, Cond, TrueLo, FalseLo); - // If a source modifier may be folded use the bitwise-op of the high - // values, otherwise just pass the high part of the value. - SDValue FoldedHi = - FoldSrcMods(CTrueRHS ? TrueHiOp : TrueHi, - CFalseRHS ? FalseHiOp : FalseHi, MVT::i32); - - SDValue ResV = - DAG.getBuildVector(MVT::v2i32, SDLoc(N), {FoldedHi, LoSelect}); - SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, ResV); - return Res; - } - } } // There's no reason to not do this if the condition has other uses. diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e90316cee12fe..77632c1423f4e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15493,6 +15493,14 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, return SDValue(); } +bool SITargetLowering::shouldFoldSelectWithIdentityConstant( + unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, + SDValue Y) const { + return (BinOpcode == ISD::AND || BinOpcode == ISD::OR || + BinOpcode == ISD::XOR) && + (VT.getScalarType() == MVT::i32); +} + SDValue SITargetLowering::performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index acf6158572a4d..f118bc37b9224 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -264,6 +264,10 @@ class SITargetLowering final : public AMDGPUTargetLowering { bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override; + bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, + unsigned SelectOpcode, SDValue X, + SDValue Y) const override; + private: // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store // the three offsets (voffset, soffset and instoffset) into the SDValue[3] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 3ca7db155b385..7584d3eb12928 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -7145,12 +7145,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX7LESS-NEXT: s_or_b32 s5, s4, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 +; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -8838,12 +8839,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX7LESS-NEXT: 
v_or_b32_e32 v0, s4, v0 +; GFX7LESS-NEXT: s_or_b32 s5, s4, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 +; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 92c63fead15ac..50efed6da381b 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -913,15 +913,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec - ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr3, killed $vgpr2, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr53, 0, $vgpr10, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr3, killed $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr15, killed $vgpr2, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr17, 0, $vgpr10, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr15, killed $vcc, implicit $exec + ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr10, killed renamable $vgpr2, renamable $vgpr10, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) ; GFX90A-NEXT: S_BRANCH %bb.65 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index ba4fe3685458d..d52fe845d62ec 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -275,14 +275,23 @@ define i16 @s_test_copysign_f16_10_mag(half inreg %sign) { ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: s_test_copysign_f16_10_mag: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s0, s0, 0x8000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s0, 0x4900 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: s_test_copysign_f16_10_mag: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0x8000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, 0x4900 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: s_test_copysign_f16_10_mag: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0x8000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, 0x4900 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.copysign.f16(half 10.0, half %sign) %cast = bitcast half %result to i16 ret i16 %cast @@ -864,21 +873,20 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; SI-NEXT: v_med3_i32 v5, v5, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; SI-NEXT: v_or_b32_e32 v7, 1, v6 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: v_or_b32_e32 v5, v0, v5 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 ; SI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-NEXT: v_and_b32_e32 v5, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v5 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; SI-NEXT: v_mov_b32_e32 v5, 0x7c00 @@ -914,21 +922,20 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; VI-NEXT: v_med3_i32 v4, v4, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v3 ; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; VI-NEXT: v_or_b32_e32 v6, 1, v5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v3 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 ; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v1 -; VI-NEXT: v_or_b32_e32 v3, v5, v3 ; VI-NEXT: v_or_b32_e32 v4, v0, v4 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; VI-NEXT: v_and_b32_e32 v4, 7, v3 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -960,19 +967,18 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half 
%sign) { ; GFX9-NEXT: v_med3_i32 v4, v4, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, v4, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v1, 0xfffffc10, v1 -; GFX9-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_lshl_or_b32 v4, v1, 12, v0 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v4, 7, v3 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -1002,36 +1008,35 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v3, v0 ; GFX11-NEXT: v_med3_i32 v3, v4, 0, 13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 +; GFX11-NEXT: v_lshl_or_b32 v7, v1, 12, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v6, 1, v5 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_lshl_or_b32 v4, v1, 12, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v4, 7, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 5, v4 +; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v4, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v4, vcc_lo ; 
GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %mag.trunc = fptrunc double %mag to half @@ -1057,29 +1062,31 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; SI-NEXT: s_or_b32 s3, s0, 0x1000 ; SI-NEXT: v_readfirstlane_b32 s5, v0 ; SI-NEXT: s_lshr_b32 s6, s3, s5 +; SI-NEXT: s_or_b32 s7, s6, 1 ; SI-NEXT: s_lshl_b32 s5, s6, s5 ; SI-NEXT: s_cmp_lg_u32 s5, s3 -; SI-NEXT: s_cselect_b32 s3, 1, 0 -; SI-NEXT: s_addk_i32 s4, 0xfc10 -; SI-NEXT: s_lshl_b32 s5, s4, 12 -; SI-NEXT: s_or_b32 s3, s6, s3 -; SI-NEXT: s_or_b32 s5, s0, s5 -; SI-NEXT: s_cmp_lt_i32 s4, 1 -; SI-NEXT: s_cselect_b32 s3, s3, s5 -; SI-NEXT: s_and_b32 s5, s3, 7 -; SI-NEXT: s_cmp_gt_i32 s5, 5 -; SI-NEXT: s_cselect_b32 s6, 1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 3 -; SI-NEXT: s_cselect_b32 s5, 1, 0 -; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cselect_b32 s3, s7, s6 +; SI-NEXT: s_add_i32 s8, s4, 0xfffffc10 +; SI-NEXT: s_lshl_b32 s4, s8, 12 +; SI-NEXT: s_or_b32 s4, s0, s4 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s3, s3, s4 +; SI-NEXT: s_and_b32 s6, s3, 7 +; SI-NEXT: s_cmp_eq_u32 s6, 3 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_cmp_gt_i32 s6, 5 +; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; SI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 1, 0 ; SI-NEXT: s_lshr_b32 s3, s3, 2 -; SI-NEXT: s_add_i32 s3, s3, s5 -; SI-NEXT: s_cmp_lt_i32 s4, 31 +; SI-NEXT: s_add_i32 s3, s3, s4 +; SI-NEXT: s_cmp_lt_i32 s8, 31 ; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_movk_i32 s0, 0x7e00 ; SI-NEXT: s_cselect_b32 s0, s0, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s4, 0x40f +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f ; SI-NEXT: s_cselect_b32 s0, s0, s3 ; SI-NEXT: s_lshr_b32 s1, s1, 16 ; SI-NEXT: s_and_b32 s1, s1, 0x8000 @@ -1104,35 +1111,37 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; VI-NEXT: s_and_b32 s0, s0, 0xffe ; VI-NEXT: v_readfirstlane_b32 s3, v0 ; VI-NEXT: s_sub_i32 s4, 0x3f1, s1 -; VI-NEXT: s_or_b32 s0, s0, s3 +; VI-NEXT: s_or_b32 s3, s0, s3 ; VI-NEXT: v_med3_i32 v0, s4, 0, 13 -; VI-NEXT: s_or_b32 s3, s0, 0x1000 +; VI-NEXT: s_or_b32 s0, s3, 0x1000 ; VI-NEXT: v_readfirstlane_b32 s4, v0 -; VI-NEXT: s_lshr_b32 s5, s3, s4 +; VI-NEXT: s_lshr_b32 s5, s0, s4 +; VI-NEXT: s_or_b32 s6, s5, 1 ; VI-NEXT: s_lshl_b32 s4, s5, s4 -; VI-NEXT: s_cmp_lg_u32 s4, s3 -; VI-NEXT: s_cselect_b32 s3, 1, 0 -; VI-NEXT: s_addk_i32 s1, 0xfc10 -; VI-NEXT: s_lshl_b32 s4, s1, 12 -; VI-NEXT: s_or_b32 s3, s5, s3 -; VI-NEXT: s_or_b32 s4, s0, s4 -; VI-NEXT: s_cmp_lt_i32 s1, 1 -; VI-NEXT: s_cselect_b32 s3, s3, s4 -; VI-NEXT: s_and_b32 s4, s3, 7 -; VI-NEXT: s_cmp_gt_i32 s4, 5 -; VI-NEXT: s_cselect_b32 s5, 1, 0 +; VI-NEXT: s_cmp_lg_u32 s4, s0 +; VI-NEXT: s_cselect_b32 s0, s6, s5 +; VI-NEXT: s_add_i32 s6, s1, 0xfffffc10 +; VI-NEXT: s_lshl_b32 s1, s6, 12 +; VI-NEXT: s_or_b32 s1, s3, s1 +; VI-NEXT: s_cmp_lt_i32 s6, 1 +; VI-NEXT: s_cselect_b32 s7, s0, s1 +; VI-NEXT: s_and_b32 s4, s7, 7 ; VI-NEXT: s_cmp_eq_u32 s4, 3 -; VI-NEXT: s_cselect_b32 s4, 1, 0 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_lshr_b32 s3, s3, 2 -; VI-NEXT: s_add_i32 s3, s3, s4 -; VI-NEXT: s_cmp_lt_i32 s1, 31 -; VI-NEXT: s_cselect_b32 s3, s3, 0x7c00 -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: 
s_movk_i32 s0, 0x7e00 -; VI-NEXT: s_cselect_b32 s0, s0, 0x7c00 -; VI-NEXT: s_cmpk_eq_i32 s1, 0x40f -; VI-NEXT: s_cselect_b32 s0, s0, s3 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_gt_i32 s4, 5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; VI-NEXT: s_cselect_b32 s0, 1, 0 +; VI-NEXT: s_lshr_b32 s1, s7, 2 +; VI-NEXT: s_add_i32 s1, s1, s0 +; VI-NEXT: s_cmp_lt_i32 s6, 31 +; VI-NEXT: s_cselect_b32 s0, s1, 0x7c00 +; VI-NEXT: s_cmp_lg_u32 s3, 0 +; VI-NEXT: s_movk_i32 s1, 0x7e00 +; VI-NEXT: s_cselect_b32 s1, s1, 0x7c00 +; VI-NEXT: s_cmpk_eq_i32 s6, 0x40f +; VI-NEXT: s_cselect_b32 s0, s1, s0 ; VI-NEXT: s_movk_i32 s1, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -1152,35 +1161,37 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX9-NEXT: s_and_b32 s0, s0, 0xffe ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 ; GFX9-NEXT: s_sub_i32 s4, 0x3f1, s1 -; GFX9-NEXT: s_or_b32 s0, s0, s3 +; GFX9-NEXT: s_or_b32 s3, s0, s3 ; GFX9-NEXT: v_med3_i32 v0, s4, 0, 13 -; GFX9-NEXT: s_or_b32 s3, s0, 0x1000 +; GFX9-NEXT: s_or_b32 s0, s3, 0x1000 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_lshr_b32 s5, s3, s4 +; GFX9-NEXT: s_lshr_b32 s5, s0, s4 +; GFX9-NEXT: s_or_b32 s6, s5, 1 ; GFX9-NEXT: s_lshl_b32 s4, s5, s4 -; GFX9-NEXT: s_cmp_lg_u32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_addk_i32 s1, 0xfc10 -; GFX9-NEXT: s_lshl_b32 s4, s1, 12 -; GFX9-NEXT: s_or_b32 s3, s5, s3 -; GFX9-NEXT: s_or_b32 s4, s0, s4 -; GFX9-NEXT: s_cmp_lt_i32 s1, 1 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_and_b32 s4, s3, 7 -; GFX9-NEXT: s_cmp_gt_i32 s4, 5 -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_cmp_lg_u32 s4, s0 +; GFX9-NEXT: s_cselect_b32 s0, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s1, 0xfffffc10 +; GFX9-NEXT: s_lshl_b32 s1, s6, 12 +; GFX9-NEXT: s_or_b32 s1, s3, s1 +; GFX9-NEXT: s_cmp_lt_i32 s6, 1 +; GFX9-NEXT: s_cselect_b32 s7, s0, s1 +; GFX9-NEXT: s_and_b32 s4, s7, 7 ; GFX9-NEXT: s_cmp_eq_u32 s4, 3 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s3, s3, 2 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_cmp_lt_i32 s1, 31 -; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7c00 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_movk_i32 s0, 0x7e00 -; GFX9-NEXT: s_cselect_b32 s0, s0, 0x7c00 -; GFX9-NEXT: s_cmpk_eq_i32 s1, 0x40f -; GFX9-NEXT: s_cselect_b32 s0, s0, s3 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 s4, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9-NEXT: s_lshr_b32 s1, s7, 2 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_cmp_lt_i32 s6, 31 +; GFX9-NEXT: s_cselect_b32 s0, s1, 0x7c00 +; GFX9-NEXT: s_cmp_lg_u32 s3, 0 +; GFX9-NEXT: s_movk_i32 s1, 0x7e00 +; GFX9-NEXT: s_cselect_b32 s1, s1, 0x7c00 +; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x40f +; GFX9-NEXT: s_cselect_b32 s0, s1, s0 ; GFX9-NEXT: s_movk_i32 s1, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1188,59 +1199,120 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s3, s0 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: s_bfe_u32 s0, s1, 0xb0014 -; GFX11-NEXT: s_lshr_b32 s1, s1, 8 -; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s0 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffe -; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 -; GFX11-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-NEXT: s_or_b32 s1, s1, s3 -; GFX11-NEXT: s_or_b32 s3, s1, 0x1000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s5, s3, s4 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s4, s3 -; GFX11-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-NEXT: s_addk_i32 s0, 0xfc10 -; GFX11-NEXT: s_or_b32 s3, s5, s3 -; GFX11-NEXT: s_lshl_b32 s4, s0, 12 -; GFX11-NEXT: s_or_b32 s4, s1, s4 -; GFX11-NEXT: s_cmp_lt_i32 s0, 1 -; GFX11-NEXT: s_cselect_b32 s3, s3, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s4, s3, 7 -; GFX11-NEXT: s_cmp_gt_i32 s4, 5 -; GFX11-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-NEXT: s_lshr_b32 s3, s3, 2 -; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s3, s3, s4 -; GFX11-NEXT: s_cmp_lt_i32 s0, 31 -; GFX11-NEXT: s_movk_i32 s4, 0x7e00 -; GFX11-NEXT: s_cselect_b32 s3, s3, 0x7c00 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_cselect_b32 s1, s4, 0x7c00 -; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x40f -; GFX11-NEXT: s_cselect_b32 s0, s1, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_sub_i32 s3, 0x3f1, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffe +; GFX11-TRUE16-NEXT: v_med3_i32 v1, s3, 0, 13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s1, 0x1000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s5, 1 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s4, s3 +; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s6, s5 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0xfc10 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s0, 12 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s1, s4 +; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s0, 1 +; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 7 +; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 3 +; GFX11-TRUE16-NEXT: s_cselect_b32 s5, -1, 0 +; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s4, 5 +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s0, 31 +; GFX11-TRUE16-NEXT: s_movk_i32 s4, 0x7e00 +; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s4, 0x7c00 +; GFX11-TRUE16-NEXT: s_cmpk_eq_i32 s0, 0x40f +; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s1, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_sub_i32 s3, 0x3f1, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffe +; GFX11-FAKE16-NEXT: v_med3_i32 v1, s3, 0, 13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s1, 0x1000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s3, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s5, 1 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s4, s3 +; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s6, s5 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0xfc10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s0, 12 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s1, s4 +; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s0, 1 +; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 7 +; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 3 +; GFX11-FAKE16-NEXT: s_cselect_b32 s5, -1, 0 +; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s4, 5 +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, 
-1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, s4 +; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s0, 31 +; GFX11-FAKE16-NEXT: s_movk_i32 s4, 0x7e00 +; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s4, 0x7c00 +; GFX11-FAKE16-NEXT: s_cmpk_eq_i32 s0, 0x40f +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s1, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %mag.trunc = fptrunc double %mag to half %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign) %cast = bitcast half %result to i16 @@ -3029,28 +3101,27 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v7, v3, 20, 11 -; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: s_movk_i32 s6, 0x3f1 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v7 +; SI-NEXT: v_sub_i32_e32 v8, vcc, s6, v7 ; SI-NEXT: v_or_b32_e32 v6, 0x1000, v2 ; SI-NEXT: v_med3_i32 v8, v8, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v9, v8, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v10, 1, v9 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v6 -; SI-NEXT: s_movk_i32 s5, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 +; SI-NEXT: s_movk_i32 s7, 0xfc10 +; SI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v7 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 ; SI-NEXT: v_or_b32_e32 v8, v2, v8 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; SI-NEXT: v_and_b32_e32 v8, 7, v6 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 -; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v8 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; SI-NEXT: v_mov_b32_e32 v8, 0x7c00 @@ -3058,9 +3129,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; SI-NEXT: v_mov_b32_e32 v9, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_movk_i32 s6, 0x40f +; SI-NEXT: s_movk_i32 s8, 0x40f ; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v7 ; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v1 @@ -3073,25 +3144,24 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v6, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6 +; SI-NEXT: v_sub_i32_e32 v7, vcc, s6, v6 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 ; SI-NEXT: 
v_med3_i32 v7, v7, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v10, v7, v3 ; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v10 +; SI-NEXT: v_or_b32_e32 v11, 1, v10 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3 -; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 +; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v7, v0, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 ; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; SI-NEXT: v_and_b32_e32 v7, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 @@ -3100,7 +3170,7 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; SI-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v6 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 @@ -3124,28 +3194,27 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; VI-NEXT: v_and_b32_e32 v5, 0xffe, v5 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v3, v3, 20, 11 -; VI-NEXT: s_movk_i32 s4, 0x3f1 +; VI-NEXT: s_movk_i32 s6, 0x3f1 ; VI-NEXT: v_or_b32_e32 v2, v5, v2 -; VI-NEXT: v_sub_u32_e32 v6, vcc, s4, v3 +; VI-NEXT: v_sub_u32_e32 v6, vcc, s6, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x1000, v2 ; VI-NEXT: v_med3_i32 v6, v6, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v7, v6, v5 ; VI-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; VI-NEXT: v_or_b32_e32 v8, 1, v7 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v5 -; VI-NEXT: s_movk_i32 s5, 0xfc10 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, s5, v3 +; VI-NEXT: s_movk_i32 s7, 0xfc10 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3 ; VI-NEXT: v_lshlrev_b32_e32 v6, 12, v3 -; VI-NEXT: v_or_b32_e32 v5, v7, v5 ; VI-NEXT: v_or_b32_e32 v6, v2, v6 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; VI-NEXT: v_and_b32_e32 v6, 7, v5 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 -; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 -; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v6 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v6 ; VI-NEXT: v_mov_b32_e32 v6, 0x7c00 @@ -3153,9 +3222,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; VI-NEXT: v_mov_b32_e32 v7, 0x7e00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_movk_i32 s6, 0x40f +; VI-NEXT: s_movk_i32 s8, 0x40f ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; VI-NEXT: v_and_b32_e32 v5, 0x1ff, v1 ; VI-NEXT: 
v_or_b32_e32 v0, v5, v0 @@ -3165,32 +3234,31 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v1, v1, 20, 11 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v1 +; VI-NEXT: v_sub_u32_e32 v5, vcc, s6, v1 ; VI-NEXT: v_or_b32_e32 v3, 0x1000, v0 ; VI-NEXT: v_med3_i32 v5, v5, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v8, v5, v3 ; VI-NEXT: v_lshlrev_b32_e32 v5, v5, v8 +; VI-NEXT: v_or_b32_e32 v9, 1, v8 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 -; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, s7, v1 ; VI-NEXT: v_lshlrev_b32_e32 v5, 12, v1 -; VI-NEXT: v_or_b32_e32 v3, v8, v3 ; VI-NEXT: v_or_b32_e32 v5, v0, v5 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 ; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; VI-NEXT: v_and_b32_e32 v5, 7, v3 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 -; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v5, v5, v8 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v5 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 ; VI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -3202,32 +3270,31 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x1ff -; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX9-NEXT: s_movk_i32 s6, 0x1ff +; GFX9-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: s_movk_i32 s5, 0xffe +; GFX9-NEXT: s_movk_i32 s7, 0xffe ; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 11 -; GFX9-NEXT: v_and_or_b32 v0, v5, s5, v0 +; GFX9-NEXT: v_and_or_b32 v0, v5, s7, v0 ; GFX9-NEXT: v_sub_u32_e32 v7, 0x3f1, v6 ; GFX9-NEXT: v_or_b32_e32 v5, 0x1000, v0 ; GFX9-NEXT: v_med3_i32 v7, v7, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, v7, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 1, v8 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v6, 0xfffffc10, v6 -; GFX9-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX9-NEXT: v_lshl_or_b32 v7, v6, 12, v0 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7c00 @@ 
-3235,47 +3302,46 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7e00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_movk_i32 s6, 0x40f +; GFX9-NEXT: s_movk_i32 s8, 0x40f ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: s_mov_b32 s7, 0x8000 -; GFX9-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v2 +; GFX9-NEXT: s_mov_b32 s9, 0x8000 +; GFX9-NEXT: v_and_or_b32 v0, v1, s9, v0 +; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 20, 11 -; GFX9-NEXT: v_and_or_b32 v1, v2, s5, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1 ; GFX9-NEXT: v_sub_u32_e32 v6, 0x3f1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, 0x1000, v1 ; GFX9-NEXT: v_med3_i32 v6, v6, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, v6, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, v6, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 1, v9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5 -; GFX9-NEXT: v_or_b32_e32 v2, v9, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc ; GFX9-NEXT: v_lshl_or_b32 v6, v5, 12, v1 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v6 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, s9, v1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff @@ -3289,12 +3355,11 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 20, 11 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 20, 11 -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 20, 11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v9, 0x3f1, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: 
v_and_or_b32 v2, 0xffe, v5, v2 @@ -3311,61 +3376,59 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, v5, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, v8, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v5, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 1, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 1, v12 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v9 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v6, 12, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v11, v13 :: v_dual_add_nc_u32 v7, 0xfffffc10, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, 0x7e00 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v5, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v7, 12, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v12, v15 :: v_dual_add_nc_u32 v6, 0xfffffc10, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v6, 12, v2 ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0xfffffc10, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v12, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v7, 12, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v10, v5 :: v_dual_mov_b32 v10, 0x7e00 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 7, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 7, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5 -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v8, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v9 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 7, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s2, 5, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s2, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v8, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v10 :: v_dual_add_nc_u32 v5, v5, v11 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v11 :: v_dual_add_nc_u32 v5, v5, v12 ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v12, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v9, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x8000, v12, v0 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x8000, v9, v0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 @@ -3378,17 +3441,15 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 20, 11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 20, 11 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, 0x7e00 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 20, 11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v9, 0x3f1, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, 
v5, v0 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v7 @@ -3401,62 +3462,59 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, v8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, v5, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, v8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v5, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 1, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 1, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v9 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v6, 12, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v11, v14 :: v_dual_add_nc_u32 v7, 0xfffffc10, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, 0x7e00 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v5, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v7, 12, v2 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v12, v15 :: v_dual_add_nc_u32 v6, 0xfffffc10, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v6, 12, v0 ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0xfffffc10, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v12, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v7, 12, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 7, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 7, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e64 v12, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s2, 5, v10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v12 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v11, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, 0x7c00, v8 :: v_dual_add_nc_u32 v5, v5, v10 ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v3, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v3, v2 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %mag.trunc = fptrunc <2 x double> %mag to <2 x half> @@ -3853,78 +3911,82 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_or_b32 s4, s0, 0x1000 ; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: s_lshr_b32 s7, s4, s6 +; SI-NEXT: s_or_b32 s8, s7, 1 ; SI-NEXT: s_lshl_b32 s6, s7, s6 ; SI-NEXT: s_cmp_lg_u32 s6, s4 -; SI-NEXT: s_cselect_b32 s4, 1, 0 -; SI-NEXT: s_addk_i32 s5, 0xfc10 -; SI-NEXT: s_lshl_b32 s6, s5, 12 -; SI-NEXT: s_or_b32 s4, s7, s4 -; SI-NEXT: s_or_b32 s6, s0, s6 -; SI-NEXT: s_cmp_lt_i32 s5, 1 -; SI-NEXT: s_cselect_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s4, 7 -; SI-NEXT: s_cmp_gt_i32 s6, 5 -; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_cselect_b32 s4, s8, s7 +; SI-NEXT: s_add_i32 s8, s5, 0xfffffc10 +; SI-NEXT: s_lshl_b32 s5, s8, 12 +; SI-NEXT: s_or_b32 s5, s0, s5 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s6, s9, 7 ; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: s_cselect_b32 s6, 1, 0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_lshr_b32 s4, s4, 2 -; SI-NEXT: s_add_i32 s4, s4, s6 -; SI-NEXT: s_cmp_lt_i32 s5, 
31 -; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_cmp_gt_i32 s6, 5 +; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; SI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_lshr_b32 s5, s9, 2 +; SI-NEXT: s_add_i32 s5, s5, s4 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s4, s5, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_movk_i32 s6, 0x7e00 ; SI-NEXT: s_cselect_b32 s0, s6, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s5, 0x40f +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f ; SI-NEXT: s_cselect_b32 s0, s0, s4 ; SI-NEXT: s_lshr_b32 s1, s1, 16 ; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s4, s1, s0 +; SI-NEXT: s_or_b32 s7, s1, s0 ; SI-NEXT: s_lshr_b32 s0, s3, 8 -; SI-NEXT: s_and_b32 s5, s0, 0xffe +; SI-NEXT: s_and_b32 s4, s0, 0xffe ; SI-NEXT: s_and_b32 s0, s3, 0x1ff ; SI-NEXT: s_or_b32 s0, s0, s2 ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SI-NEXT: v_readfirstlane_b32 s0, v2 -; SI-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; SI-NEXT: s_or_b32 s0, s5, s0 -; SI-NEXT: s_sub_i32 s5, 0x3f1, s2 -; SI-NEXT: v_med3_i32 v2, s5, 0, 13 -; SI-NEXT: s_or_b32 s1, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s5, v2 -; SI-NEXT: s_lshr_b32 s7, s1, s5 -; SI-NEXT: s_lshl_b32 s5, s7, s5 -; SI-NEXT: s_cmp_lg_u32 s5, s1 -; SI-NEXT: s_cselect_b32 s1, 1, 0 -; SI-NEXT: s_addk_i32 s2, 0xfc10 -; SI-NEXT: s_lshl_b32 s5, s2, 12 -; SI-NEXT: s_or_b32 s1, s7, s1 -; SI-NEXT: s_or_b32 s5, s0, s5 -; SI-NEXT: s_cmp_lt_i32 s2, 1 -; SI-NEXT: s_cselect_b32 s1, s1, s5 -; SI-NEXT: s_and_b32 s5, s1, 7 -; SI-NEXT: s_cmp_gt_i32 s5, 5 -; SI-NEXT: s_cselect_b32 s7, 1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 3 -; SI-NEXT: s_cselect_b32 s5, 1, 0 -; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: s_lshr_b32 s1, s1, 2 -; SI-NEXT: s_add_i32 s1, s1, s5 -; SI-NEXT: s_cmp_lt_i32 s2, 31 -; SI-NEXT: s_cselect_b32 s1, s1, 0x7c00 -; SI-NEXT: s_cmp_lg_u32 s0, 0 -; SI-NEXT: s_cselect_b32 s0, s6, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s2, 0x40f -; SI-NEXT: s_cselect_b32 s0, s0, s1 +; SI-NEXT: s_bfe_u32 s1, s3, 0xb0014 +; SI-NEXT: s_or_b32 s2, s4, s0 +; SI-NEXT: s_sub_i32 s4, 0x3f1, s1 +; SI-NEXT: v_med3_i32 v2, s4, 0, 13 +; SI-NEXT: s_or_b32 s0, s2, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: s_lshr_b32 s5, s0, s4 +; SI-NEXT: s_or_b32 s8, s5, 1 +; SI-NEXT: s_lshl_b32 s4, s5, s4 +; SI-NEXT: s_cmp_lg_u32 s4, s0 +; SI-NEXT: s_cselect_b32 s0, s8, s5 +; SI-NEXT: s_add_i32 s8, s1, 0xfffffc10 +; SI-NEXT: s_lshl_b32 s1, s8, 12 +; SI-NEXT: s_or_b32 s1, s2, s1 +; SI-NEXT: s_cmp_lt_i32 s8, 1 +; SI-NEXT: s_cselect_b32 s9, s0, s1 +; SI-NEXT: s_and_b32 s4, s9, 7 +; SI-NEXT: s_cmp_eq_u32 s4, 3 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cmp_gt_i32 s4, 5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cselect_b32 s0, 1, 0 +; SI-NEXT: s_lshr_b32 s1, s9, 2 +; SI-NEXT: s_add_i32 s1, s1, s0 +; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_cselect_b32 s0, s1, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cselect_b32 s1, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cselect_b32 s0, s1, s0 ; SI-NEXT: s_lshr_b32 s1, s3, 16 ; SI-NEXT: s_and_b32 s1, s1, 0x8000 ; SI-NEXT: s_or_b32 s0, s1, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 ; SI-NEXT: 
s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v0, s0, v2, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3947,36 +4009,38 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014 ; VI-NEXT: v_readfirstlane_b32 s2, v0 ; VI-NEXT: s_sub_i32 s6, 0x3f1, s3 -; VI-NEXT: s_or_b32 s2, s5, s2 +; VI-NEXT: s_or_b32 s5, s5, s2 ; VI-NEXT: v_med3_i32 v0, s6, 0, 13 -; VI-NEXT: s_or_b32 s5, s2, 0x1000 +; VI-NEXT: s_or_b32 s2, s5, 0x1000 ; VI-NEXT: v_readfirstlane_b32 s6, v0 -; VI-NEXT: s_lshr_b32 s7, s5, s6 +; VI-NEXT: s_lshr_b32 s7, s2, s6 +; VI-NEXT: s_or_b32 s8, s7, 1 ; VI-NEXT: s_lshl_b32 s6, s7, s6 -; VI-NEXT: s_cmp_lg_u32 s6, s5 -; VI-NEXT: s_cselect_b32 s5, 1, 0 -; VI-NEXT: s_addk_i32 s3, 0xfc10 -; VI-NEXT: s_lshl_b32 s6, s3, 12 -; VI-NEXT: s_or_b32 s5, s7, s5 -; VI-NEXT: s_or_b32 s6, s2, s6 -; VI-NEXT: s_cmp_lt_i32 s3, 1 -; VI-NEXT: s_cselect_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s5, 7 -; VI-NEXT: s_cmp_gt_i32 s6, 5 -; VI-NEXT: s_cselect_b32 s7, 1, 0 +; VI-NEXT: s_cmp_lg_u32 s6, s2 +; VI-NEXT: s_cselect_b32 s2, s8, s7 +; VI-NEXT: s_add_i32 s8, s3, 0xfffffc10 +; VI-NEXT: s_lshl_b32 s3, s8, 12 +; VI-NEXT: s_or_b32 s3, s5, s3 +; VI-NEXT: s_cmp_lt_i32 s8, 1 +; VI-NEXT: s_cselect_b32 s9, s2, s3 +; VI-NEXT: s_and_b32 s6, s9, 7 ; VI-NEXT: s_cmp_eq_u32 s6, 3 -; VI-NEXT: s_cselect_b32 s6, 1, 0 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_lshr_b32 s5, s5, 2 -; VI-NEXT: s_add_i32 s5, s5, s6 -; VI-NEXT: s_cmp_lt_i32 s3, 31 -; VI-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; VI-NEXT: s_cmp_lg_u32 s2, 0 -; VI-NEXT: s_movk_i32 s6, 0x7e00 -; VI-NEXT: s_cselect_b32 s2, s6, 0x7c00 -; VI-NEXT: s_cmpk_eq_i32 s3, 0x40f -; VI-NEXT: s_cselect_b32 s2, s2, s5 -; VI-NEXT: s_lshl_b32 s5, s2, 16 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_gt_i32 s6, 5 +; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 +; VI-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] +; VI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; VI-NEXT: s_cselect_b32 s2, 1, 0 +; VI-NEXT: s_lshr_b32 s3, s9, 2 +; VI-NEXT: s_add_i32 s3, s3, s2 +; VI-NEXT: s_cmp_lt_i32 s8, 31 +; VI-NEXT: s_cselect_b32 s2, s3, 0x7c00 +; VI-NEXT: s_cmp_lg_u32 s5, 0 +; VI-NEXT: s_movk_i32 s5, 0x7e00 +; VI-NEXT: s_cselect_b32 s3, s5, 0x7c00 +; VI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; VI-NEXT: s_cselect_b32 s2, s3, s2 +; VI-NEXT: s_lshl_b32 s6, s2, 16 ; VI-NEXT: s_lshr_b32 s2, s1, 8 ; VI-NEXT: s_and_b32 s7, s2, 0xffe ; VI-NEXT: s_and_b32 s2, s1, 0x1ff @@ -3986,37 +4050,39 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014 ; VI-NEXT: v_readfirstlane_b32 s0, v0 -; VI-NEXT: s_sub_i32 s3, 0x3f1, s1 -; VI-NEXT: s_or_b32 s0, s7, s0 -; VI-NEXT: v_med3_i32 v0, s3, 0, 13 -; VI-NEXT: s_or_b32 s2, s0, 0x1000 -; VI-NEXT: v_readfirstlane_b32 s3, v0 -; VI-NEXT: s_lshr_b32 s7, s2, s3 -; VI-NEXT: s_lshl_b32 s3, s7, s3 -; VI-NEXT: s_cmp_lg_u32 s3, s2 -; VI-NEXT: s_cselect_b32 s2, 1, 0 -; VI-NEXT: s_addk_i32 s1, 0xfc10 -; VI-NEXT: s_lshl_b32 s3, s1, 12 -; VI-NEXT: s_or_b32 s2, s7, s2 -; VI-NEXT: s_or_b32 s3, s0, s3 -; VI-NEXT: s_cmp_lt_i32 s1, 1 -; VI-NEXT: s_cselect_b32 s2, s2, s3 -; VI-NEXT: s_and_b32 s3, s2, 7 -; VI-NEXT: s_cmp_gt_i32 s3, 5 -; VI-NEXT: s_cselect_b32 s7, 1, 0 -; VI-NEXT: s_cmp_eq_u32 s3, 3 -; VI-NEXT: s_cselect_b32 s3, 1, 0 -; VI-NEXT: s_or_b32 s3, s3, s7 -; VI-NEXT: s_lshr_b32 s2, s2, 2 -; VI-NEXT: s_add_i32 s2, s2, s3 -; VI-NEXT: s_cmp_lt_i32 s1, 31 -; VI-NEXT: s_cselect_b32 s2, s2, 0x7c00 -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: 
s_cselect_b32 s0, s6, 0x7c00 -; VI-NEXT: s_cmpk_eq_i32 s1, 0x40f -; VI-NEXT: s_cselect_b32 s0, s0, s2 +; VI-NEXT: s_sub_i32 s2, 0x3f1, s1 +; VI-NEXT: s_or_b32 s7, s7, s0 +; VI-NEXT: v_med3_i32 v0, s2, 0, 13 +; VI-NEXT: s_or_b32 s0, s7, 0x1000 +; VI-NEXT: v_readfirstlane_b32 s2, v0 +; VI-NEXT: s_lshr_b32 s3, s0, s2 +; VI-NEXT: s_or_b32 s8, s3, 1 +; VI-NEXT: s_lshl_b32 s2, s3, s2 +; VI-NEXT: s_cmp_lg_u32 s2, s0 +; VI-NEXT: s_cselect_b32 s0, s8, s3 +; VI-NEXT: s_add_i32 s8, s1, 0xfffffc10 +; VI-NEXT: s_lshl_b32 s1, s8, 12 +; VI-NEXT: s_or_b32 s1, s7, s1 +; VI-NEXT: s_cmp_lt_i32 s8, 1 +; VI-NEXT: s_cselect_b32 s9, s0, s1 +; VI-NEXT: s_and_b32 s2, s9, 7 +; VI-NEXT: s_cmp_eq_u32 s2, 3 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_gt_i32 s2, 5 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; VI-NEXT: s_cselect_b32 s0, 1, 0 +; VI-NEXT: s_lshr_b32 s1, s9, 2 +; VI-NEXT: s_add_i32 s1, s1, s0 +; VI-NEXT: s_cmp_lt_i32 s8, 31 +; VI-NEXT: s_cselect_b32 s0, s1, 0x7c00 +; VI-NEXT: s_cmp_lg_u32 s7, 0 +; VI-NEXT: s_cselect_b32 s1, s5, 0x7c00 +; VI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; VI-NEXT: s_cselect_b32 s0, s1, s0 ; VI-NEXT: s_and_b32 s0, s0, 0x7fff -; VI-NEXT: s_or_b32 s0, s0, s5 +; VI-NEXT: s_or_b32 s0, s0, s6 ; VI-NEXT: s_mov_b32 s1, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -4041,29 +4107,31 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-NEXT: s_or_b32 s5, s2, 0x1000 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_lshr_b32 s8, s5, s7 +; GFX9-NEXT: s_or_b32 s9, s8, 1 ; GFX9-NEXT: s_lshl_b32 s7, s8, s7 ; GFX9-NEXT: s_cmp_lg_u32 s7, s5 -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_addk_i32 s6, 0xfc10 -; GFX9-NEXT: s_lshl_b32 s7, s6, 12 -; GFX9-NEXT: s_or_b32 s5, s8, s5 -; GFX9-NEXT: s_or_b32 s7, s2, s7 -; GFX9-NEXT: s_cmp_lt_i32 s6, 1 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_and_b32 s7, s5, 7 -; GFX9-NEXT: s_cmp_gt_i32 s7, 5 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 3 -; GFX9-NEXT: s_cselect_b32 s7, 1, 0 -; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cselect_b32 s5, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s6, 0xfffffc10 +; GFX9-NEXT: s_lshl_b32 s6, s10, 12 +; GFX9-NEXT: s_or_b32 s6, s2, s6 +; GFX9-NEXT: s_cmp_lt_i32 s10, 1 +; GFX9-NEXT: s_cselect_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s8, s5, 7 +; GFX9-NEXT: s_cmp_eq_u32 s8, 3 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 s8, 5 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: s_cselect_b32 s6, 1, 0 ; GFX9-NEXT: s_lshr_b32 s5, s5, 2 -; GFX9-NEXT: s_add_i32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_i32 s6, 31 +; GFX9-NEXT: s_add_i32 s5, s5, s6 +; GFX9-NEXT: s_cmp_lt_i32 s10, 31 ; GFX9-NEXT: s_cselect_b32 s5, s5, 0x7c00 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_movk_i32 s7, 0x7e00 -; GFX9-NEXT: s_cselect_b32 s2, s7, 0x7c00 -; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x40f +; GFX9-NEXT: s_movk_i32 s8, 0x7e00 +; GFX9-NEXT: s_cselect_b32 s2, s8, 0x7c00 +; GFX9-NEXT: s_cmpk_eq_i32 s10, 0x40f ; GFX9-NEXT: s_cselect_b32 s2, s2, s5 ; GFX9-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NEXT: s_and_b32 s3, s3, 0x8000 @@ -4082,29 +4150,31 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-NEXT: v_med3_i32 v0, s6, 0, 13 ; GFX9-NEXT: s_or_b32 s2, s0, 0x1000 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: 
s_lshr_b32 s8, s2, s6 -; GFX9-NEXT: s_lshl_b32 s6, s8, s6 +; GFX9-NEXT: s_lshr_b32 s7, s2, s6 +; GFX9-NEXT: s_or_b32 s9, s7, 1 +; GFX9-NEXT: s_lshl_b32 s6, s7, s6 ; GFX9-NEXT: s_cmp_lg_u32 s6, s2 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_addk_i32 s3, 0xfc10 -; GFX9-NEXT: s_lshl_b32 s6, s3, 12 -; GFX9-NEXT: s_or_b32 s2, s8, s2 -; GFX9-NEXT: s_or_b32 s6, s0, s6 -; GFX9-NEXT: s_cmp_lt_i32 s3, 1 -; GFX9-NEXT: s_cselect_b32 s2, s2, s6 -; GFX9-NEXT: s_and_b32 s6, s2, 7 -; GFX9-NEXT: s_cmp_gt_i32 s6, 5 -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_cselect_b32 s2, s9, s7 +; GFX9-NEXT: s_add_i32 s9, s3, 0xfffffc10 +; GFX9-NEXT: s_lshl_b32 s3, s9, 12 +; GFX9-NEXT: s_or_b32 s3, s0, s3 +; GFX9-NEXT: s_cmp_lt_i32 s9, 1 +; GFX9-NEXT: s_cselect_b32 s10, s2, s3 +; GFX9-NEXT: s_and_b32 s6, s10, 7 ; GFX9-NEXT: s_cmp_eq_u32 s6, 3 -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: s_lshr_b32 s2, s2, 2 -; GFX9-NEXT: s_add_i32 s2, s2, s6 -; GFX9-NEXT: s_cmp_lt_i32 s3, 31 -; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7c00 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 s6, 5 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_lshr_b32 s3, s10, 2 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_cmp_lt_i32 s9, 31 +; GFX9-NEXT: s_cselect_b32 s2, s3, 0x7c00 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s0, s7, 0x7c00 -; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX9-NEXT: s_cselect_b32 s0, s8, 0x7c00 +; GFX9-NEXT: s_cmpk_eq_i32 s9, 0x40f ; GFX9-NEXT: s_cselect_b32 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: s_and_b32 s1, s1, 0x8000 @@ -4139,23 +4209,26 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s8, s6, s7 ; GFX11-NEXT: s_lshl_b32 s7, s8, s7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s9, s8, 1 ; GFX11-NEXT: s_cmp_lg_u32 s7, s6 -; GFX11-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-NEXT: s_cselect_b32 s6, s9, s8 ; GFX11-NEXT: s_addk_i32 s2, 0xfc10 -; GFX11-NEXT: s_or_b32 s6, s8, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s7, s2, 12 ; GFX11-NEXT: s_or_b32 s7, s5, s7 ; GFX11-NEXT: s_cmp_lt_i32 s2, 1 ; GFX11-NEXT: s_cselect_b32 s6, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s7, s6, 7 -; GFX11-NEXT: s_cmp_gt_i32 s7, 5 -; GFX11-NEXT: s_cselect_b32 s8, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s7, 3 +; GFX11-NEXT: s_cselect_b32 s8, -1, 0 +; GFX11-NEXT: s_cmp_gt_i32 s7, 5 +; GFX11-NEXT: s_cselect_b32 s7, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s7, exec_lo ; GFX11-NEXT: s_cselect_b32 s7, 1, 0 ; GFX11-NEXT: s_lshr_b32 s6, s6, 2 -; GFX11-NEXT: s_or_b32 s7, s7, s8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s6, s6, s7 ; GFX11-NEXT: s_cmp_lt_i32 s2, 31 @@ -4189,23 +4262,26 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s8, s5, s6 ; GFX11-NEXT: s_lshl_b32 s6, 
s8, s6 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s9, s8, 1 ; GFX11-NEXT: s_cmp_lg_u32 s6, s5 -; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_cselect_b32 s5, s9, s8 ; GFX11-NEXT: s_addk_i32 s0, 0xfc10 -; GFX11-NEXT: s_or_b32 s5, s8, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s6, s0, 12 ; GFX11-NEXT: s_or_b32 s6, s3, s6 ; GFX11-NEXT: s_cmp_lt_i32 s0, 1 ; GFX11-NEXT: s_cselect_b32 s5, s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s6, s5, 7 -; GFX11-NEXT: s_cmp_gt_i32 s6, 5 -; GFX11-NEXT: s_cselect_b32 s8, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: s_cselect_b32 s8, -1, 0 +; GFX11-NEXT: s_cmp_gt_i32 s6, 5 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s6, s6, s8 +; GFX11-NEXT: s_and_b32 s6, s6, exec_lo ; GFX11-NEXT: s_cselect_b32 s6, 1, 0 ; GFX11-NEXT: s_lshr_b32 s5, s5, 2 -; GFX11-NEXT: s_or_b32 s6, s6, s8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s5, s5, s6 ; GFX11-NEXT: s_cmp_lt_i32 s0, 31 @@ -4342,15 +4418,27 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> inreg ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_pack_ll_b32_b16 s1, s2, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s2, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %sign.trunc = fptrunc <2 x double> %sign to <2 x half> %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag, <2 x half> %sign.trunc) %cast = bitcast <2 x half> %out to i32 @@ -4665,28 +4753,27 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-NEXT: v_and_b32_e32 v9, 0xffe, v9 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v10, v5, 20, 11 -; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: s_movk_i32 s6, 0x3f1 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_sub_i32_e32 v11, vcc, s4, v10 +; 
SI-NEXT: v_sub_i32_e32 v11, vcc, s6, v10 ; SI-NEXT: v_or_b32_e32 v9, 0x1000, v4 ; SI-NEXT: v_med3_i32 v11, v11, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v12, v11, v9 ; SI-NEXT: v_lshlrev_b32_e32 v11, v11, v12 +; SI-NEXT: v_or_b32_e32 v13, 1, v12 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v9 -; SI-NEXT: s_movk_i32 s5, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, s5, v10 +; SI-NEXT: s_movk_i32 s7, 0xfc10 +; SI-NEXT: v_cndmask_b32_e32 v9, v12, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 12, v10 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v11, v4, v11 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v10 ; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc ; SI-NEXT: v_and_b32_e32 v11, 7, v9 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11 -; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v11 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v9, 2, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; SI-NEXT: v_mov_b32_e32 v11, 0x7c00 @@ -4694,9 +4781,9 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc ; SI-NEXT: v_mov_b32_e32 v12, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: s_movk_i32 s6, 0x40f +; SI-NEXT: s_movk_i32 s8, 0x40f ; SI-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v10 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v10 ; SI-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v9, 0x1ff, v3 @@ -4709,32 +4796,31 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v9, v3, 20, 11 ; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_sub_i32_e32 v10, vcc, s4, v9 +; SI-NEXT: v_sub_i32_e32 v10, vcc, s6, v9 ; SI-NEXT: v_or_b32_e32 v5, 0x1000, v2 ; SI-NEXT: v_med3_i32 v10, v10, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v13, v10, v5 ; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v13 +; SI-NEXT: v_or_b32_e32 v14, 1, v13 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v5 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, s5, v9 +; SI-NEXT: v_cndmask_b32_e32 v5, v13, v14, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v9 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 ; SI-NEXT: v_or_b32_e32 v10, v2, v10 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v9 ; SI-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc ; SI-NEXT: v_and_b32_e32 v10, 7, v5 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v10 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v9 ; SI-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v9 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v9 ; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v5, 0x1ff, v1 @@ -4747,25 +4833,24 @@ 
define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v5, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v9, vcc, s4, v5 +; SI-NEXT: v_sub_i32_e32 v9, vcc, s6, v5 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 ; SI-NEXT: v_med3_i32 v9, v9, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v10, v9, v3 ; SI-NEXT: v_lshlrev_b32_e32 v9, v9, v10 +; SI-NEXT: v_or_b32_e32 v13, 1, v10 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v3 -; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v5 +; SI-NEXT: v_cndmask_b32_e32 v3, v10, v13, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 12, v5 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v9, v0, v9 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; SI-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; SI-NEXT: v_and_b32_e32 v9, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v9 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 @@ -4773,7 +4858,7 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 @@ -4798,28 +4883,27 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; VI-NEXT: v_and_b32_e32 v8, 0xffe, v8 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v5, v5, 20, 11 -; VI-NEXT: s_movk_i32 s4, 0x3f1 +; VI-NEXT: s_movk_i32 s6, 0x3f1 ; VI-NEXT: v_or_b32_e32 v4, v8, v4 -; VI-NEXT: v_sub_u32_e32 v9, vcc, s4, v5 +; VI-NEXT: v_sub_u32_e32 v9, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v8, 0x1000, v4 ; VI-NEXT: v_med3_i32 v9, v9, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v10, v9, v8 ; VI-NEXT: v_lshlrev_b32_e32 v9, v9, v10 +; VI-NEXT: v_or_b32_e32 v11, 1, v10 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v8 -; VI-NEXT: s_movk_i32 s5, 0xfc10 -; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v5, vcc, s5, v5 +; VI-NEXT: s_movk_i32 s7, 0xfc10 +; VI-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, s7, v5 ; VI-NEXT: v_lshlrev_b32_e32 v9, 12, v5 -; VI-NEXT: v_or_b32_e32 v8, v10, v8 ; VI-NEXT: v_or_b32_e32 v9, v4, v9 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; VI-NEXT: v_and_b32_e32 v9, 7, v8 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9 -; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 -; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v9 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v8, 2, v8 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v9 ; VI-NEXT: v_mov_b32_e32 v9, 0x7c00 @@ -4827,9 +4911,9 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; VI-NEXT: 
v_mov_b32_e32 v10, 0x7e00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; VI-NEXT: s_movk_i32 s6, 0x40f +; VI-NEXT: s_movk_i32 s8, 0x40f ; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 ; VI-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; VI-NEXT: v_and_b32_e32 v8, 0x1ff, v1 ; VI-NEXT: v_or_b32_e32 v0, v8, v0 @@ -4839,32 +4923,31 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v1, v1, 20, 11 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 -; VI-NEXT: v_sub_u32_e32 v8, vcc, s4, v1 +; VI-NEXT: v_sub_u32_e32 v8, vcc, s6, v1 ; VI-NEXT: v_or_b32_e32 v5, 0x1000, v0 ; VI-NEXT: v_med3_i32 v8, v8, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v11, v8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v8, v8, v11 +; VI-NEXT: v_or_b32_e32 v12, 1, v11 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v5 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, s7, v1 ; VI-NEXT: v_lshlrev_b32_e32 v8, 12, v1 -; VI-NEXT: v_or_b32_e32 v5, v11, v5 ; VI-NEXT: v_or_b32_e32 v8, v0, v8 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 ; VI-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; VI-NEXT: v_and_b32_e32 v8, 7, v5 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 -; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 -; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v8, v8, v11 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v8 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 ; VI-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; VI-NEXT: v_and_b32_e32 v5, 0x1ff, v3 ; VI-NEXT: v_or_b32_e32 v2, v5, v2 @@ -4874,32 +4957,31 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v3, v3, 20, 11 ; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v3 +; VI-NEXT: v_sub_u32_e32 v5, vcc, s6, v3 ; VI-NEXT: v_or_b32_e32 v2, 0x1000, v1 ; VI-NEXT: v_med3_i32 v5, v5, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v8, v5, v2 ; VI-NEXT: v_lshlrev_b32_e32 v5, v5, v8 +; VI-NEXT: v_or_b32_e32 v11, 1, v8 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, s5, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v8, v11, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3 ; VI-NEXT: v_lshlrev_b32_e32 v5, 12, v3 -; VI-NEXT: v_or_b32_e32 v2, v8, v2 ; VI-NEXT: v_or_b32_e32 v5, v1, v5 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; VI-NEXT: v_and_b32_e32 v5, 7, v2 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 -; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v5, v5, v8 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v5 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v3 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -4912,32 +4994,31 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x1ff -; GFX9-NEXT: v_and_or_b32 v4, v5, s4, v4 +; GFX9-NEXT: s_movk_i32 s6, 0x1ff +; GFX9-NEXT: v_and_or_b32 v4, v5, s6, v4 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v5 -; GFX9-NEXT: s_movk_i32 s5, 0xffe +; GFX9-NEXT: s_movk_i32 s7, 0xffe ; GFX9-NEXT: v_bfe_u32 v5, v5, 20, 11 -; GFX9-NEXT: v_and_or_b32 v4, v8, s5, v4 +; GFX9-NEXT: v_and_or_b32 v4, v8, s7, v4 ; GFX9-NEXT: v_sub_u32_e32 v9, 0x3f1, v5 ; GFX9-NEXT: v_or_b32_e32 v8, 0x1000, v4 ; GFX9-NEXT: v_med3_i32 v9, v9, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, v9, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, v9, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 1, v10 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v9, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5 -; GFX9-NEXT: v_or_b32_e32 v8, v10, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc ; GFX9-NEXT: v_lshl_or_b32 v9, v5, 12, v4 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; GFX9-NEXT: v_and_b32_e32 v9, 7, v8 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v9 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 2, v8 ; GFX9-NEXT: v_add_u32_e32 v8, v8, v9 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x7c00 @@ -4945,79 +5026,77 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7e00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: s_movk_i32 s6, 0x40f +; GFX9-NEXT: s_movk_i32 s8, 0x40f ; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 -; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 +; GFX9-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 11 -; GFX9-NEXT: v_and_or_b32 v0, v5, s5, v0 +; GFX9-NEXT: v_and_or_b32 v0, v5, s7, v0 ; GFX9-NEXT: v_sub_u32_e32 v11, 0x3f1, v8 ; GFX9-NEXT: v_or_b32_e32 v5, 0x1000, v0 ; GFX9-NEXT: v_med3_i32 v11, v11, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, v11, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, v11, v12 +; GFX9-NEXT: v_or_b32_e32 v13, 1, v12 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v11, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v8, 0xfffffc10, v8 -; GFX9-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v13, vcc ; GFX9-NEXT: v_lshl_or_b32 v11, v8, 12, v0 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc ; GFX9-NEXT: v_and_b32_e32 v11, 7, 
v5 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v11 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v11 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: s_mov_b32 s7, 0x8000 -; GFX9-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v2 +; GFX9-NEXT: s_mov_b32 s9, 0x8000 +; GFX9-NEXT: v_and_or_b32 v0, v1, s9, v0 +; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 20, 11 -; GFX9-NEXT: v_and_or_b32 v1, v2, s5, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1 ; GFX9-NEXT: v_sub_u32_e32 v8, 0x3f1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, 0x1000, v1 ; GFX9-NEXT: v_med3_i32 v8, v8, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, v8, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, v8, v11 +; GFX9-NEXT: v_or_b32_e32 v12, 1, v11 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5 -; GFX9-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc ; GFX9-NEXT: v_lshl_or_b32 v8, v5, 12, v1 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_or_b32_e32 v8, v8, v11 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v8 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v8 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, s9, v1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff @@ -5031,108 +5110,106 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0x1ff, v5, v4 ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 20, 11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 20, 11 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v5, 20, 11 -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, 
v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v1, 20, 11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 8, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v1, 20, 11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v11, 0x3f1, v5 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v10, 0x3f1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0xffe, v8, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v8, 0x3f1, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-TRUE16-NEXT: v_med3_i32 v11, v11, 0, 13 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v9, v2 -; GFX11-TRUE16-NEXT: v_med3_i32 v8, v8, 0, 13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x1000, v4 +; GFX11-TRUE16-NEXT: v_med3_i32 v10, v10, 0, 13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x1000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v8, v2 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v8, 0x3f1, v9 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, v10, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x1000, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, v11, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v13, v0 -; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v13, 0x3f1, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, v8, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, v11, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x1000, v0 -; GFX11-TRUE16-NEXT: v_med3_i32 v13, v13, 0, 13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, v8, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_med3_i32 v8, v8, 0, 13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v15, v0 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v15, 0x3f1, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, v10, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, v8, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_med3_i32 v15, v15, 0, 13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, v8, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 1, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, v13, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v9 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v10, 12, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, v13, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10 +; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v8, v14, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0xfffffc10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x1000, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 1, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v9, 12, v2 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v10, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, v15, v14 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e64 s3, 31, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v10, v13, v17 :: v_dual_and_b32 v13, 7, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0xfffffc10, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v13, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, v15, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 1, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8 +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v13 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v5, 12, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 7, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 12, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v15, v13 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v12, v11 :: v_dual_and_b32 v12, 7, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 2, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v15, v14 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v12, v17 :: v_dual_add_nc_u32 v16, 0xfffffc10, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v16, 12, v0 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e64 s1, 1, v16 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, v12, v11, s1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 7, v10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 2, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s2, 5, v12 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 7, v11 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 7, v11 -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v15, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 3, v14 +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s1, 5, v14 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, 0x7c00, v8, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, 0x7c00, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, 0x7c00, v17 :: v_dual_add_nc_u32 v9, v9, v12 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0x7c00, v12, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s2, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v14, 0, 1, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v16 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v14 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v8, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v17, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v16 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5140,7 +5217,7 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v17, vcc_lo +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v6 @@ -5153,123 +5230,114 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0x1ff, v5, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 8, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v5, 20, 11 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 20, 11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v1, 20, 11 -; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v9, 0x3f1, v8 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v9, 0x3f1, v5 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v3, 20, 11 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v15, 0x3f1, v10 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: v_med3_i32 v9, v9, 0, 13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffe, v5, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v3, 20, 11 +; GFX11-FAKE16-NEXT: v_med3_i32 v15, v15, 0, 13 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffe, v8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 8, v1 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x1000, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v5, v0 -; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v8, v0 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, v9, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x1000, v0 -; GFX11-FAKE16-NEXT: v_med3_i32 v5, v5, 0, 13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, v9, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x1000, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffe, v12, v2 -; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, v9, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0xfffffc10, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, v5, v14 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 1, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v14 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x1000, v2 ; GFX11-FAKE16-NEXT: 
v_med3_i32 v12, v12, 0, 13 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v9, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v5, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, v12, v17 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, v12, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v15, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0xfffffc10, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v13, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0xfffffc10, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, v12, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v5, 12, v4 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 1, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, v15, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 1, v11 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v9, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v11, v15 :: v_dual_add_nc_u32 v10, 0xfffffc10, v10 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v12, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v8, 12, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v16, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v10, 12, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v10, 12, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v13, 12, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 7, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v14, 12, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 7, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v13, v12 :: v_dual_and_b32 v13, 7, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11 +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v11 ; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 2, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v14 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v14 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, 0x7e00 :: v_dual_add_nc_u32 v9, v9, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 7, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 7, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11 -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v15, v17 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v14 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 7, v12 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 3, v13 +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s2, 5, v13 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 2, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11 +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v11 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, s1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v13 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v18 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v11 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, 0x7e00 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, 0x7c00, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v12 :: v_dual_add_nc_u32 v11, v11, v15 -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v13 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, 0x7c00, v12, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo ; GFX11-FAKE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v5 ; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v3, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, 0x7c00, v8, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v5 ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc_lo ; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v7 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %mag.trunc = fptrunc <3 x double> %mag to <3 x half> @@ -5808,28 +5876,27 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; SI-NEXT: v_and_b32_e32 v12, 0xffe, v12 ; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v13, v7, 20, 11 -; SI-NEXT: s_movk_i32 s4, 0x3f1 +; SI-NEXT: s_movk_i32 s6, 0x3f1 ; SI-NEXT: v_or_b32_e32 v6, v12, v6 -; SI-NEXT: v_sub_i32_e32 v14, vcc, s4, v13 +; SI-NEXT: v_sub_i32_e32 v14, vcc, s6, v13 ; SI-NEXT: v_or_b32_e32 v12, 0x1000, v6 ; SI-NEXT: v_med3_i32 v14, v14, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v15, v14, v12 ; SI-NEXT: v_lshlrev_b32_e32 v14, v14, v15 +; SI-NEXT: v_or_b32_e32 v16, 1, v15 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v14, v12 -; SI-NEXT: s_movk_i32 s5, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, s5, v13 +; SI-NEXT: s_movk_i32 s7, 0xfc10 +; SI-NEXT: v_cndmask_b32_e32 v12, v15, v16, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 12, v13 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 ; SI-NEXT: v_or_b32_e32 v14, v6, v14 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v13 ; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc ; SI-NEXT: v_and_b32_e32 v14, 7, v12 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v14 -; SI-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 -; SI-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v14 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v12, 2, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; SI-NEXT: v_mov_b32_e32 v14, 0x7c00 @@ -5837,9 +5904,9 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc ; SI-NEXT: v_mov_b32_e32 v15, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: s_movk_i32 s6, 0x40f 
+; SI-NEXT: s_movk_i32 s8, 0x40f ; SI-NEXT: v_cndmask_b32_e32 v6, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v13 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v13 ; SI-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_and_b32_e32 v12, 0x1ff, v5 @@ -5852,32 +5919,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v12, v5, 20, 11 ; SI-NEXT: v_or_b32_e32 v4, v7, v4 -; SI-NEXT: v_sub_i32_e32 v13, vcc, s4, v12 +; SI-NEXT: v_sub_i32_e32 v13, vcc, s6, v12 ; SI-NEXT: v_or_b32_e32 v7, 0x1000, v4 ; SI-NEXT: v_med3_i32 v13, v13, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v16, v13, v7 ; SI-NEXT: v_lshlrev_b32_e32 v13, v13, v16 +; SI-NEXT: v_or_b32_e32 v17, 1, v16 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v13, v7 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v12, vcc, s5, v12 +; SI-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 12, v12 -; SI-NEXT: v_or_b32_e32 v7, v16, v7 ; SI-NEXT: v_or_b32_e32 v13, v4, v13 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v12 ; SI-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc ; SI-NEXT: v_and_b32_e32 v13, 7, v7 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v13 -; SI-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v13 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v13, v13, v16 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v13 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v7, 2, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v12 ; SI-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_cndmask_b32_e32 v4, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v12 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v12 ; SI-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v3 @@ -5890,32 +5956,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v7, v3, 20, 11 ; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_sub_i32_e32 v12, vcc, s4, v7 +; SI-NEXT: v_sub_i32_e32 v12, vcc, s6, v7 ; SI-NEXT: v_or_b32_e32 v5, 0x1000, v2 ; SI-NEXT: v_med3_i32 v12, v12, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v13, v12, v5 ; SI-NEXT: v_lshlrev_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, 1, v13 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v12, v5 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 +; SI-NEXT: v_cndmask_b32_e32 v5, v13, v16, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v12, 12, v7 -; SI-NEXT: v_or_b32_e32 v5, v13, v5 ; SI-NEXT: v_or_b32_e32 v12, v2, v12 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 ; SI-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc ; SI-NEXT: v_and_b32_e32 v12, 7, v5 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v12 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 ; SI-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc ; SI-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v7 ; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v5, 0x1ff, v1 @@ -5928,32 +5993,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v5, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v5 +; SI-NEXT: v_sub_i32_e32 v7, vcc, s6, v5 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 ; SI-NEXT: v_med3_i32 v7, v7, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v12, v7, v3 ; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v12 +; SI-NEXT: v_or_b32_e32 v13, 1, v12 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3 -; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v5 +; SI-NEXT: v_cndmask_b32_e32 v3, v12, v13, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 ; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v5 -; SI-NEXT: v_or_b32_e32 v3, v12, v3 ; SI-NEXT: v_or_b32_e32 v7, v0, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; SI-NEXT: v_and_b32_e32 v7, 7, v3 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 ; SI-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 @@ -5979,28 +6043,27 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; VI-NEXT: v_and_b32_e32 v10, 0xffe, v10 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v5, v5, 20, 11 -; VI-NEXT: s_movk_i32 s4, 0x3f1 +; VI-NEXT: s_movk_i32 s6, 0x3f1 ; VI-NEXT: v_or_b32_e32 v4, v10, v4 -; VI-NEXT: v_sub_u32_e32 v11, vcc, s4, v5 +; VI-NEXT: v_sub_u32_e32 v11, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v10, 0x1000, v4 ; VI-NEXT: v_med3_i32 v11, v11, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v12, v11, v10 ; VI-NEXT: v_lshlrev_b32_e32 v11, v11, v12 +; VI-NEXT: v_or_b32_e32 v13, 1, v12 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v10 -; VI-NEXT: s_movk_i32 s5, 0xfc10 -; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v5, vcc, s5, v5 +; VI-NEXT: s_movk_i32 s7, 0xfc10 +; VI-NEXT: v_cndmask_b32_e32 v10, v12, v13, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, s7, v5 ; VI-NEXT: v_lshlrev_b32_e32 v11, 12, v5 -; VI-NEXT: v_or_b32_e32 v10, v12, v10 ; VI-NEXT: v_or_b32_e32 v11, v4, v11 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; VI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc ; VI-NEXT: v_and_b32_e32 v11, 7, v10 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11 -; VI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11 -; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v11 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v10, 2, v10 ; VI-NEXT: v_add_u32_e32 
v10, vcc, v10, v11 ; VI-NEXT: v_mov_b32_e32 v11, 0x7c00 @@ -6008,9 +6071,9 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; VI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc ; VI-NEXT: v_mov_b32_e32 v12, 0x7e00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; VI-NEXT: s_movk_i32 s6, 0x40f +; VI-NEXT: s_movk_i32 s8, 0x40f ; VI-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 ; VI-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc ; VI-NEXT: v_and_b32_e32 v10, 0x1ff, v7 ; VI-NEXT: v_or_b32_e32 v6, v10, v6 @@ -6020,32 +6083,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v7, v7, 20, 11 ; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_sub_u32_e32 v10, vcc, s4, v7 +; VI-NEXT: v_sub_u32_e32 v10, vcc, s6, v7 ; VI-NEXT: v_or_b32_e32 v6, 0x1000, v5 ; VI-NEXT: v_med3_i32 v10, v10, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v13, v10, v6 ; VI-NEXT: v_lshlrev_b32_e32 v10, v10, v13 +; VI-NEXT: v_or_b32_e32 v14, 1, v13 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v6 -; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v7, vcc, s5, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v13, v14, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, s7, v7 ; VI-NEXT: v_lshlrev_b32_e32 v10, 12, v7 -; VI-NEXT: v_or_b32_e32 v6, v13, v6 ; VI-NEXT: v_or_b32_e32 v10, v5, v10 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 ; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc ; VI-NEXT: v_and_b32_e32 v10, 7, v6 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 -; VI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 -; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v10, v10, v13 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v10 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v10 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 ; VI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v7 ; VI-NEXT: v_and_b32_e32 v7, 0x1ff, v1 ; VI-NEXT: v_or_b32_e32 v0, v7, v0 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc @@ -6055,32 +6117,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v1, v1, 20, 11 ; VI-NEXT: v_or_b32_e32 v0, v6, v0 -; VI-NEXT: v_sub_u32_e32 v7, vcc, s4, v1 +; VI-NEXT: v_sub_u32_e32 v7, vcc, s6, v1 ; VI-NEXT: v_or_b32_e32 v6, 0x1000, v0 ; VI-NEXT: v_med3_i32 v7, v7, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v10, v7, v6 ; VI-NEXT: v_lshlrev_b32_e32 v7, v7, v10 +; VI-NEXT: v_or_b32_e32 v13, 1, v10 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v6 -; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1 +; VI-NEXT: v_cndmask_b32_e32 v6, v10, v13, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, s7, v1 ; VI-NEXT: v_lshlrev_b32_e32 v7, 12, v1 -; VI-NEXT: v_or_b32_e32 v6, v10, v6 ; VI-NEXT: v_or_b32_e32 v7, v0, v7 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 ; VI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; VI-NEXT: v_and_b32_e32 v7, 7, v6 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v7, v7, v10 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7 +; VI-NEXT: 
s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 ; VI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; VI-NEXT: v_and_b32_e32 v6, 0x1ff, v3 ; VI-NEXT: v_or_b32_e32 v2, v6, v2 @@ -6090,32 +6151,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v3, v3, 20, 11 ; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_sub_u32_e32 v6, vcc, s4, v3 +; VI-NEXT: v_sub_u32_e32 v6, vcc, s6, v3 ; VI-NEXT: v_or_b32_e32 v2, 0x1000, v1 ; VI-NEXT: v_med3_i32 v6, v6, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v7, v6, v2 ; VI-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; VI-NEXT: v_or_b32_e32 v10, 1, v7 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, s5, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3 ; VI-NEXT: v_lshlrev_b32_e32 v6, 12, v3 -; VI-NEXT: v_or_b32_e32 v2, v7, v2 ; VI-NEXT: v_or_b32_e32 v6, v1, v6 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; VI-NEXT: v_and_b32_e32 v6, 7, v2 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 -; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 -; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v6 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v3 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -6131,32 +6191,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f64_sign_v4f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x1ff -; GFX9-NEXT: v_and_or_b32 v4, v5, s4, v4 +; GFX9-NEXT: s_movk_i32 s6, 0x1ff +; GFX9-NEXT: v_and_or_b32 v4, v5, s6, v4 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 -; GFX9-NEXT: s_movk_i32 s5, 0xffe +; GFX9-NEXT: s_movk_i32 s7, 0xffe ; GFX9-NEXT: v_bfe_u32 v11, v5, 20, 11 -; GFX9-NEXT: v_and_or_b32 v4, v10, s5, v4 +; GFX9-NEXT: v_and_or_b32 v4, v10, s7, v4 ; GFX9-NEXT: v_sub_u32_e32 v12, 0x3f1, v11 ; GFX9-NEXT: v_or_b32_e32 v10, 0x1000, v4 ; GFX9-NEXT: v_med3_i32 v12, v12, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, v12, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, v12, v13 +; GFX9-NEXT: v_or_b32_e32 v14, 1, v13 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v12, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v11, 0xfffffc10, v11 -; GFX9-NEXT: v_or_b32_e32 v10, v13, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v14, vcc ; GFX9-NEXT: v_lshl_or_b32 v12, v11, 12, v4 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v11 ; GFX9-NEXT: 
v_cndmask_b32_e32 v10, v12, v10, vcc ; GFX9-NEXT: v_and_b32_e32 v12, 7, v10 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v12 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 2, v10 ; GFX9-NEXT: v_add_u32_e32 v10, v10, v12 ; GFX9-NEXT: v_mov_b32_e32 v12, 0x7c00 @@ -6164,115 +6223,112 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc ; GFX9-NEXT: v_mov_b32_e32 v13, 0x7e00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: s_movk_i32 s6, 0x40f +; GFX9-NEXT: s_movk_i32 s8, 0x40f ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v13, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: s_mov_b32 s7, 0x8000 -; GFX9-NEXT: v_and_or_b32 v4, v5, s7, v4 -; GFX9-NEXT: v_and_or_b32 v5, v7, s4, v6 +; GFX9-NEXT: s_mov_b32 s9, 0x8000 +; GFX9-NEXT: v_and_or_b32 v4, v5, s9, v4 +; GFX9-NEXT: v_and_or_b32 v5, v7, s6, v6 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v7 ; GFX9-NEXT: v_bfe_u32 v10, v7, 20, 11 -; GFX9-NEXT: v_and_or_b32 v5, v6, s5, v5 +; GFX9-NEXT: v_and_or_b32 v5, v6, s7, v5 ; GFX9-NEXT: v_sub_u32_e32 v11, 0x3f1, v10 ; GFX9-NEXT: v_or_b32_e32 v6, 0x1000, v5 ; GFX9-NEXT: v_med3_i32 v11, v11, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, v11, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, v11, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 1, v14 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v11, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v10, 0xfffffc10, v10 -; GFX9-NEXT: v_or_b32_e32 v6, v14, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v15, vcc ; GFX9-NEXT: v_lshl_or_b32 v11, v10, 12, v5 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v11, 7, v6 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_or_b32_e32 v11, v11, v14 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v11 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v6 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v11 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v13, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v10 -; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v10 +; GFX9-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_and_or_b32 v5, v6, s7, v5 +; GFX9-NEXT: v_and_or_b32 v5, v6, s9, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NEXT: v_bfe_u32 v7, v1, 20, 11 -; GFX9-NEXT: v_and_or_b32 v0, v6, s5, v0 +; GFX9-NEXT: v_and_or_b32 v0, v6, s7, v0 ; GFX9-NEXT: v_sub_u32_e32 v10, 0x3f1, v7 ; GFX9-NEXT: v_or_b32_e32 v6, 0x1000, v0 ; GFX9-NEXT: v_med3_i32 v10, 
v10, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, v10, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, v10, v11 +; GFX9-NEXT: v_or_b32_e32 v14, 1, v11 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v10, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v7, 0xfffffc10, v7 -; GFX9-NEXT: v_or_b32_e32 v6, v11, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v11, v14, vcc ; GFX9-NEXT: v_lshl_or_b32 v10, v7, 12, v0 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v10, 7, v6 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v10 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v6 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v10 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v2 +; GFX9-NEXT: v_and_or_b32 v0, v1, s9, v0 +; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX9-NEXT: v_bfe_u32 v6, v3, 20, 11 -; GFX9-NEXT: v_and_or_b32 v1, v2, s5, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1 ; GFX9-NEXT: v_sub_u32_e32 v7, 0x3f1, v6 ; GFX9-NEXT: v_or_b32_e32 v2, 0x1000, v1 ; GFX9-NEXT: v_med3_i32 v7, v7, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, v7, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, v7, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 1, v10 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v7, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v6, 0xfffffc10, v6 -; GFX9-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc ; GFX9-NEXT: v_lshl_or_b32 v7, v6, 12, v1 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v7 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v13, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, s9, v1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_mov_b32 s5, 0x7fff7fff @@ -6288,158 +6344,149 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v7, 20, 11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v11, 8, v7 ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0x1ff, v5, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 8, v5 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 ; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v5, 20, 11 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2 ; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 20, 11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v3, 20, 11 -; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v1, 20, 11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v7.h +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, 0xffe, v11, v6 ; GFX11-TRUE16-NEXT: v_med3_i32 v11, v12, 0, 13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x1000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0xffe, v15, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, v11, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x1000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, v11, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 1, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 8, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v14, 0x3f1, v17 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v10, 12, v6 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v16 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v14, v17 :: v_dual_add_nc_u32 v16, 0xfffffc10, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v10, 12, v6 ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0xffe, v16, v4 -; GFX11-TRUE16-NEXT: v_med3_i32 v14, v14, 0, 13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v21, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_med3_i32 v12, v12, 0, 13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 8, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v3, 20, 11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v18, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, 0x7e00 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x1000, v4 -; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 
v21, 0x3f1, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 7, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, v12, v15 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 7, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11 -; GFX11-TRUE16-NEXT: v_med3_i32 v21, v21, 0, 13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v18, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, v14, v22 -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v12, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, v12, v19 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 1, v19 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v18 +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v18 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v18, 0x3f1, v17 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v14, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v16, 12, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0xfffffc10, v17 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v12, v15 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v21, 0, 1, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x1000, v2 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e64 s1, 1, v17 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v19, v20, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v23 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, 0x7c00, v13 :: v_dual_add_nc_u32 v11, v11, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, v14, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x1000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v12, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, v21, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, 0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, 0x7c00, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v16 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v12, v14, v12 :: v_dual_add_nc_u32 v11, v11, v21 ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v18, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_med3_i32 v14, v18, 0, 13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, 0x7c00, v11 :: v_dual_and_b32 v18, 7, v12 ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0xfffffc10, v17 -; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v10, 0x3f1, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, v14, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 2, v12 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v17, 12, v4 -; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, v21, v22 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 20, 11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, v14, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 1, v19 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, v14, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 12, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v10, v0 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v10, 0x3f1, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v14, v19, v21, s0 +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0xfffffc10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x1000, v0 ; GFX11-TRUE16-NEXT: v_med3_i32 v10, v10, 0, 13 -; GFX11-TRUE16-NEXT: v_and_or_b32 v6, 0x8000, v15, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x1000, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v14, 12, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, v10, v16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v22, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 7, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v14, v15, v14, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e64 s1, 1, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, v10, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 7, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 2, v14 +; GFX11-TRUE16-NEXT: v_and_or_b32 v6, 0x8000, v20, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, v10, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 2, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 7, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v10, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 
v7, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0xfffffc10, v20 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 1, v7 +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v10, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v11, 12, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v16, 12, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v16 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v10, v7 :: v_dual_add_nc_u32 v10, v12, v18 -; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v17 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 7, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v10, v7, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v12, v15 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 7, v7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 0x40f, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 2, v7 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, 0x7c00, v13 :: v_dual_add_nc_u32 v11, v11, v12 -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v18, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v15 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, 0x7c00, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, v10, v4, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0x8000, v15, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e64 s1, 31, v17 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0x8000, v20, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, 0x7c00, v13 :: v_dual_add_nc_u32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v12 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v17 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v13, 
vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0x7c00, v12, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v16 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v15, v2 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v11 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v20, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v4, v9 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v16 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x8000, v15, v0 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x8000, v20, v0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v8 @@ -6455,150 +6502,144 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v7 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v7, 20, 11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v7, 20, 11 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v17, 0x3f1, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v17, 0x3f1, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v3, 20, 11 ; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffe, v11, v4 ; GFX11-FAKE16-NEXT: v_med3_i32 v11, v12, 0, 13 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x1000, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX11-FAKE16-NEXT: v_and_or_b32 v6, 0xffe, v13, v6 -; GFX11-FAKE16-NEXT: v_med3_i32 v13, v17, 0, 13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x1000, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, v11, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, v11, v12 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e64 s3, 0, v6 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, v11, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 1, v15 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v1, 20, 11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v14, v11 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, v13, v17 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v10, 12, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v15, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v10, 12, v4 +; GFX11-FAKE16-NEXT: v_med3_i32 v15, v17, 0, 13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x1000, v6 ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, v13, v15 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v18, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v18, 0x3f1, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, v15, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v1, 20, 11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, v15, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 7, v11 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 1, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11 -; GFX11-FAKE16-NEXT: v_med3_i32 v18, v18, 0, 13 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v16, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v13, v17 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v14, 12, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v19 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v15, v13 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e64 s1, v15, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v14, 12, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, v17, v21, s1 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v17, 0x3f1, v20 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v17 -; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v13, v16, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x1000, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v3, 20, 11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v15 -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, v18, v16 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffe, v17, v2 -; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v17, 0x3f1, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0xfffffc10, v19 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, 0x7c00, v11 :: v_dual_lshlrev_b32 v18, v18, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 7, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x1000, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0xfffffc10, v20 ; GFX11-FAKE16-NEXT: v_med3_i32 v17, v17, 0, 13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 2, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, v17, v21 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v18, v16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, 0x7e00 :: v_dual_lshlrev_b32 v17, v17, v23 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v22 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v17, v21 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v23, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0xfffffc10, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v12, 12, v0 -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v20, v16, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v19, 12, v2 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, 0x7c00, v18 :: v_dual_and_b32 v15, 7, v16 -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v19, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, 0x7e00 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v12, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v13 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffe, v18, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x1000, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_med3_i32 v12, v12, 0, 13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0xfffffc10, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, v12, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, v12, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 7, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 1, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 2, v15 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x1000, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, v17, v16 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v12, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v13, 12, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, v17, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 1, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v21, v24, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v20, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v17, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v20, 12, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v23, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v18, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7c00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v20 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 7, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 2, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 7, v16 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 7, v17 +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v6, 0x7c00, v19, s3 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s2, 5, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 2, v16 ; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0x8000, v5, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v15, v13 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 2, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 2, v17 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, 
vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v15, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v20 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7c00, v18, vcc_lo +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v12 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, 0x7c00, v13 :: v_dual_add_nc_u32 v10, v16, v11 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v18, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v19 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v16, v15 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v20 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v18, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v12 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v20 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc_lo ; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v3, v2 ; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x8000, v7, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index 462d7748b86cd..af0c38c5624ba 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -3944,9 +3944,10 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 20, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2 -; SI-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NEXT: v_and_b32_e32 v3, 0x7ff, v3 ; SI-NEXT: s_movk_i32 s4, 0x3f1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 @@ -3954,21 +3955,20 @@ define 
half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; SI-NEXT: v_med3_i32 v4, v4, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v6, 1, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 ; SI-NEXT: v_or_b32_e32 v4, v0, v4 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-NEXT: v_and_b32_e32 v4, 7, v2 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -3994,9 +3994,10 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 20, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: v_and_b32_e32 v2, 0xffe, v2 -; VI-NEXT: v_bfe_u32 v3, v1, 20, 11 +; VI-NEXT: v_and_b32_e32 v3, 0x7ff, v3 ; VI-NEXT: s_movk_i32 s4, 0x3f1 ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_sub_u32_e32 v4, vcc, s4, v3 @@ -4004,21 +4005,20 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; VI-NEXT: v_med3_i32 v4, v4, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; VI-NEXT: v_or_b32_e32 v6, 1, v5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 -; VI-NEXT: v_or_b32_e32 v2, v5, v2 ; VI-NEXT: v_or_b32_e32 v4, v0, v4 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-NEXT: v_and_b32_e32 v4, 7, v2 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -4041,47 +4041,47 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX11-NEXT: v_bfe_u32 v3, v1, 20, 11 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 20, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0x7ff, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: 
v_and_or_b32 v0, 0xffe, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13 +; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 0x3f1, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xfffffc10, v2 ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_med3_i32 v3, v3, 0, 13 +; GFX11-NEXT: v_lshl_or_b32 v7, v2, 12, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v6, 1, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v4, 7, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 5, v4 +; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v5 :: v_dual_add_nc_u32 v3, v3, v4 +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, 
v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fpround = fptrunc double %a to half @@ -4106,21 +4106,20 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_med3_i32 v4, v4, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v6, 1, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 ; SI-NEXT: v_or_b32_e32 v4, v0, v4 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-NEXT: v_and_b32_e32 v4, 7, v2 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -4154,21 +4153,20 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; VI-NEXT: v_med3_i32 v4, v4, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; VI-NEXT: v_or_b32_e32 v6, 1, v5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 -; VI-NEXT: v_or_b32_e32 v2, v5, v2 ; VI-NEXT: v_or_b32_e32 v4, v0, v4 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-NEXT: v_and_b32_e32 v4, 7, v2 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -4202,35 +4200,32 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX11-NEXT: v_or_b32_e32 v6, 1, v5 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 -; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v6 :: v_dual_add_nc_u32 v3, 0xfffffc10, v3 +; GFX11-NEXT: v_lshl_or_b32 v7, v3, 12, v0 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_cndmask_b32 v2, v7, v2 ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 5, v4 +; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg.a = fneg double %a @@ -4258,21 +4253,20 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; SI-NEXT: v_med3_i32 v5, v5, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v2 ; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v6 +; SI-NEXT: v_or_b32_e32 v7, 1, v6 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_or_b32_e32 v5, v0, v5 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 ; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; SI-NEXT: v_and_b32_e32 v5, 7, v2 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v5 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; SI-NEXT: v_mov_b32_e32 v5, 0x7c00 @@ -4310,21 +4304,20 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; VI-NEXT: v_med3_i32 v6, v6, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v7, v6, v4 ; VI-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; VI-NEXT: v_or_b32_e32 v8, 1, v7 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc ; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v5 ; VI-NEXT: v_lshlrev_b32_e32 v6, 12, v5 -; VI-NEXT: v_or_b32_e32 v4, v7, v4 ; VI-NEXT: v_or_b32_e32 v6, v0, v6 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; VI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; VI-NEXT: v_and_b32_e32 v6, 7, v4 -; VI-NEXT: 
v_cmp_lt_i32_e32 vcc, 5, v6 -; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 -; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v6 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v4, 2, v4 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_mov_b32_e32 v6, 0x7c00 @@ -4359,32 +4352,28 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v5, 0x1000, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v3, v6 +; GFX11-NEXT: v_or_b32_e32 v7, 1, v6 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0xfffffc10, v4 -; GFX11-NEXT: v_lshl_or_b32 v5, v4, 12, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v6, v7 :: v_dual_add_nc_u32 v4, 0xfffffc10, v4 +; GFX11-NEXT: v_lshl_or_b32 v8, v4, 12, v2 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo +; GFX11-NEXT: v_dual_mov_b32 v6, 0x7e00 :: v_dual_cndmask_b32 v3, v8, v3 ; GFX11-NEXT: v_and_b32_e32 v5, 7, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 5, v5 +; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v6, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v2, 0x7c00, v6 :: v_dual_add_nc_u32 v3, v3, v5 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v4 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo @@ -4521,21 +4510,20 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; SI-NEXT: v_med3_i32 v7, v7, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v8, v7, v5 ; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v8 +; SI-NEXT: v_or_b32_e32 v9, 1, v8 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; 
SI-NEXT: v_add_i32_e32 v6, vcc, s4, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_or_b32_e32 v7, v4, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 ; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; SI-NEXT: v_and_b32_e32 v7, 7, v5 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; SI-NEXT: v_mov_b32_e32 v7, 0x7c00 @@ -4572,22 +4560,21 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; VI-NEXT: v_med3_i32 v7, v7, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v8, v7, v5 ; VI-NEXT: v_lshlrev_b32_e32 v7, v7, v8 +; VI-NEXT: v_or_b32_e32 v9, 1, v8 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6 ; VI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 -; VI-NEXT: v_or_b32_e32 v5, v8, v5 ; VI-NEXT: v_or_b32_e32 v7, v4, v7 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 ; VI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; VI-NEXT: v_and_b32_e32 v7, 7, v5 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; VI-NEXT: v_mul_f64 v[2:3], -v[0:1], v[2:3] -; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v7 ; VI-NEXT: v_mov_b32_e32 v7, 0x7c00 @@ -4625,28 +4612,27 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v8, v5, v7 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v5, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v9, 1, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v7 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v7, v6, 12, v4 +; GFX11-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v10, v6, 12, v4 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 7, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 2, v5 -; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: 
v_cmp_lt_i32_e64 s0, 5, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 2, v5 +; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX11-NEXT: v_dual_mov_b32 v7, 0x7e00 :: v_dual_add_nc_u32 v0, v5, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7c00, v7, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v5 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6 @@ -4681,21 +4667,20 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_med3_i32 v4, v4, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v6, 1, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 ; SI-NEXT: v_or_b32_e32 v4, v0, v4 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-NEXT: v_and_b32_e32 v4, 7, v2 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -4731,21 +4716,20 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; VI-NEXT: v_med3_i32 v4, v4, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; VI-NEXT: v_or_b32_e32 v6, 1, v5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 -; VI-NEXT: v_or_b32_e32 v2, v5, v2 ; VI-NEXT: v_or_b32_e32 v4, v0, v4 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-NEXT: v_and_b32_e32 v4, 7, v2 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 +; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -4780,36 +4764,34 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; 
GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX11-NEXT: v_or_b32_e32 v6, 1, v5 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 -; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v6 :: v_dual_add_nc_u32 v3, 0xfffffc10, v3 +; GFX11-NEXT: v_lshl_or_b32 v7, v3, 12, v0 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_cndmask_b32 v2, v7, v2 ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 5, v4 +; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v1, 0x8000, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-SAFE-TRUE16-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index e687745469014..40b33f48f4813 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -4318,9 +4318,9 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s1, 1, s1 -; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: s_xor_b32 s0, s0, 0x80008000 ; SI-NEXT: s_cmp_eq_u32 s1, 1 +; SI-NEXT: s_cselect_b32 s0, 0x80008000, s0 ; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -4358,9 +4358,10 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 1, v2 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; 
SI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0x80008000 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 1b092b283290a..c20b99444ab35 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -349,29 +349,24 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: select_fneg_xor_select_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = xor i32 %arg0, -2147483648 %select0 = select i1 %cond0, i32 %arg1, i32 %fneg0 @@ -550,31 +545,25 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: select_fneg_xor_select_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; 
GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = xor i64 %arg0, 9223372036854775808 %select0 = select i1 %cond0, i64 %arg1, i64 %fneg0 @@ -936,10 +925,8 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) { ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; GCN-NEXT: v_bfrev_b32_e32 v2, 1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, -v1, vcc ; GCN-NEXT: v_mov_b32_e32 v0, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -947,14 +934,14 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 0x80000000, vcc_lo -; GFX11-NEXT: v_xor_b32_e32 v1, v1, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %i = and i32 %arg, 1 %i3 = icmp eq i32 %i, 0 @@ -1015,12 +1002,11 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) { ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff8000 -; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_xor_b32_e32 v2, 0x8000, v0 +; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, 1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 49c563eef5d82..d99cf35c482a4 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -111,34 +111,36 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 ; SI-NEXT: v_readfirstlane_b32 s1, v0 ; SI-NEXT: s_sub_i32 s6, 0x3f1, s0 -; SI-NEXT: s_or_b32 s1, s8, s1 +; SI-NEXT: s_or_b32 s10, s8, s1 ; SI-NEXT: v_med3_i32 v0, s6, 0, 13 -; SI-NEXT: s_or_b32 s6, s1, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s8, v0 -; SI-NEXT: s_lshr_b32 s9, s6, s8 -; SI-NEXT: s_lshl_b32 s8, s9, s8 -; SI-NEXT: s_cmp_lg_u32 s8, s6 -; SI-NEXT: s_cselect_b32 s6, 1, 0 -; SI-NEXT: s_addk_i32 s0, 
0xfc10 -; SI-NEXT: s_or_b32 s6, s9, s6 -; SI-NEXT: s_lshl_b32 s8, s0, 12 -; SI-NEXT: s_or_b32 s8, s1, s8 -; SI-NEXT: s_cmp_lt_i32 s0, 1 -; SI-NEXT: s_cselect_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s6, 7 -; SI-NEXT: s_cmp_gt_i32 s8, 5 -; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_or_b32 s1, s10, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_lshr_b32 s8, s1, s6 +; SI-NEXT: s_or_b32 s9, s8, 1 +; SI-NEXT: s_lshl_b32 s6, s8, s6 +; SI-NEXT: s_cmp_lg_u32 s6, s1 +; SI-NEXT: s_cselect_b32 s1, s9, s8 +; SI-NEXT: s_add_i32 s6, s0, 0xfffffc10 +; SI-NEXT: s_lshl_b32 s0, s6, 12 +; SI-NEXT: s_or_b32 s0, s10, s0 +; SI-NEXT: s_cmp_lt_i32 s6, 1 +; SI-NEXT: s_cselect_b32 s11, s1, s0 +; SI-NEXT: s_and_b32 s8, s11, 7 ; SI-NEXT: s_cmp_eq_u32 s8, 3 -; SI-NEXT: s_cselect_b32 s8, 1, 0 -; SI-NEXT: s_lshr_b32 s6, s6, 2 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_add_i32 s6, s6, s8 -; SI-NEXT: s_cmp_lt_i32 s0, 31 -; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00 -; SI-NEXT: s_cmp_lg_u32 s1, 0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cselect_b32 s0, 1, 0 +; SI-NEXT: s_lshr_b32 s1, s11, 2 +; SI-NEXT: s_add_i32 s1, s1, s0 +; SI-NEXT: s_cmp_lt_i32 s6, 31 +; SI-NEXT: s_cselect_b32 s0, s1, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s10, 0 ; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s0, 0x40f -; SI-NEXT: s_cselect_b32 s0, s1, s6 +; SI-NEXT: s_cmpk_eq_i32 s6, 0x40f +; SI-NEXT: s_cselect_b32 s0, s1, s0 ; SI-NEXT: s_lshr_b32 s1, s7, 16 ; SI-NEXT: s_and_b32 s1, s1, 0x8000 ; SI-NEXT: s_or_b32 s6, s1, s0 @@ -165,37 +167,39 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; VI-SAFE-SDAG-NEXT: s_bfe_u32 s6, s7, 0xb0014 -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4 -; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s6 +; VI-SAFE-SDAG-NEXT: s_bfe_u32 s5, s7, 0xb0014 +; VI-SAFE-SDAG-NEXT: s_or_b32 s6, s8, s4 +; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s5 ; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13 -; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 +; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s6, 0x1000 ; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s5, s8 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s4, s8 +; VI-SAFE-SDAG-NEXT: s_or_b32 s10, s9, 1 ; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s9, s8 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s5 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 -; VI-SAFE-SDAG-NEXT: s_addk_i32 s6, 0xfc10 -; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s6, 12 -; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s9, s5 -; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s4, s8 -; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 1 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s8 -; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s5, 7 -; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s4 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s10, s9 +; VI-SAFE-SDAG-NEXT: s_add_i32 s10, s5, 0xfffffc10 +; VI-SAFE-SDAG-NEXT: s_lshl_b32 s5, s10, 12 +; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5 +; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 1 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s11, s4, s5 +; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s11, 7 ; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, 1, 0 -; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s8, s9 -; 
VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 -; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s8 -; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 31 +; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5 +; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[8:9], -1, 0 +; VI-SAFE-SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; VI-SAFE-SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, 1, 0 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s11, 2 +; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s4 +; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 31 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s5, 0x7c00 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0 +; VI-SAFE-SDAG-NEXT: s_movk_i32 s5, 0x7e00 ; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; VI-SAFE-SDAG-NEXT: s_movk_i32 s4, 0x7e00 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 -; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s5 +; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s5, s4 ; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s7, 16 ; VI-SAFE-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 ; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s5, s4 @@ -296,21 +300,23 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 ; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 ; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s8, s7, 1 ; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s8, s7 ; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 ; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 ; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 ; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 ; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 ; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 -; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 ; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0 +; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 +; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s6, exec_lo ; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 ; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 ; GFX10-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 ; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 ; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00 @@ -425,23 +431,26 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 ; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s8, s7, 1 ; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s8, s7 ; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 ; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 ; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 ; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 ; 
GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 -; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 ; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0 +; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 +; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s6, exec_lo ; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 ; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 ; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll index d8f21d285ddff..27e5b521ae8c3 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll @@ -284,91 +284,85 @@ define <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) { ; GFX950-SDAG-LABEL: v_test_cvt_v2f64_v2f16: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0x1ff -; GFX950-SDAG-NEXT: v_and_or_b32 v0, v1, s0, v0 +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0x1ff +; GFX950-SDAG-NEXT: v_and_or_b32 v0, v1, s2, v0 ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX950-SDAG-NEXT: s_movk_i32 s1, 0xffe +; GFX950-SDAG-NEXT: s_movk_i32 s3, 0xffe ; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX950-SDAG-NEXT: v_bfe_u32 v5, v1, 20, 11 -; GFX950-SDAG-NEXT: v_and_or_b32 v0, v4, s1, v0 +; GFX950-SDAG-NEXT: v_and_or_b32 v0, v4, s3, v0 ; GFX950-SDAG-NEXT: v_sub_u32_e32 v6, 0x3f1, v5 ; GFX950-SDAG-NEXT: v_or_b32_e32 v4, 0x1000, v0 ; GFX950-SDAG-NEXT: v_med3_i32 v6, v6, 0, 13 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v7, v6, v4 ; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; GFX950-SDAG-NEXT: v_or_b32_e32 v8, 1, v7 ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4 ; GFX950-SDAG-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5 ; GFX950-SDAG-NEXT: v_lshl_or_b32 v6, v5, 12, v0 -; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX950-SDAG-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc ; GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 -; GFX950-SDAG-NEXT: s_movk_i32 s2, 0x40f -; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x40f ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX950-SDAG-NEXT: v_and_b32_e32 v6, 7, v4 -; GFX950-SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 -; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v4 -; GFX950-SDAG-NEXT: s_mov_b32 s3, 0x8000 -; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 -; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX950-SDAG-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX950-SDAG-NEXT: v_cmp_lt_i32_e64 s[0:1], 5, v6 +; GFX950-SDAG-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX950-SDAG-NEXT: v_add_u32_e32 v4, v4, v6 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0x7c00 ; 
GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX950-SDAG-NEXT: s_mov_b32 s5, 0x8000 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v5 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v5 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX950-SDAG-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX950-SDAG-NEXT: v_and_or_b32 v1, v3, s0, v2 +; GFX950-SDAG-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX950-SDAG-NEXT: v_and_or_b32 v1, v3, s2, v2 ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX950-SDAG-NEXT: v_bfe_u32 v4, v3, 20, 11 ; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX950-SDAG-NEXT: v_and_or_b32 v1, v2, s1, v1 +; GFX950-SDAG-NEXT: v_and_or_b32 v1, v2, s3, v1 ; GFX950-SDAG-NEXT: v_sub_u32_e32 v5, 0x3f1, v4 ; GFX950-SDAG-NEXT: v_or_b32_e32 v2, 0x1000, v1 ; GFX950-SDAG-NEXT: v_med3_i32 v5, v5, 0, 13 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v8, v5, v2 ; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v5, v5, v8 +; GFX950-SDAG-NEXT: v_or_b32_e32 v9, 1, v8 ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 ; GFX950-SDAG-NEXT: v_add_u32_e32 v4, 0xfffffc10, v4 ; GFX950-SDAG-NEXT: v_lshl_or_b32 v5, v4, 12, v1 -; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX950-SDAG-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc ; GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 -; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX950-SDAG-NEXT: v_and_b32_e32 v5, 7, v2 -; GFX950-SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 -; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 -; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX950-SDAG-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX950-SDAG-NEXT: v_cmp_lt_i32_e64 s[0:1], 5, v5 +; GFX950-SDAG-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] ; GFX950-SDAG-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 -; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-SDAG-NEXT: s_nop 0 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v4 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX950-SDAG-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX950-SDAG-NEXT: v_and_or_b32 v1, v2, s5, v1 ; GFX950-SDAG-NEXT: v_perm_b32 v0, v1, v0, s0 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 9389f1614721f..a841f7ffa02b9 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -51,160 +51,314 @@ bb: 
; FIXME: This generates "instid1(/* invalid instid value */)". define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) { -; GFX11-LABEL: f2: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] -; GFX11-NEXT: v_mov_b32_e32 v31, v0 -; GFX11-NEXT: s_load_b32 s19, s[16:17], 0x24 -; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s20, 0 -; GFX11-NEXT: s_mov_b32 s0, -1 -; GFX11-NEXT: s_mov_b32 s3, exec_lo -; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v0, s19, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB2_13 -; GFX11-NEXT: ; %bb.1: ; %bb14 -; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c -; GFX11-NEXT: s_mov_b32 s18, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitcmp1_b32 s21, 0 -; GFX11-NEXT: s_cselect_b32 s24, -1, 0 -; GFX11-NEXT: s_bitcmp0_b32 s21, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 -; GFX11-NEXT: ; %bb.2: ; %bb15 -; GFX11-NEXT: s_add_u32 s8, s16, 0x58 -; GFX11-NEXT: s_addc_u32 s9, s17, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 -; GFX11-NEXT: s_mov_b32 s13, s14 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s21, s14 -; GFX11-NEXT: s_mov_b32 s14, s15 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_mov_b32 s14, s21 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_cbranch_execz .LBB2_4 -; GFX11-NEXT: s_branch .LBB2_12 -; GFX11-NEXT: .LBB2_3: -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccnz .LBB2_12 -; GFX11-NEXT: .LBB2_4: ; %bb16 -; GFX11-NEXT: s_load_b32 s0, s[16:17], 0x54 -; GFX11-NEXT: s_bitcmp1_b32 s23, 0 -; GFX11-NEXT: s_cselect_b32 s9, -1, 0 -; GFX11-NEXT: s_and_b32 s1, s23, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitcmp1_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s0, -1 -; GFX11-NEXT: s_cselect_b32 s8, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 -; GFX11-NEXT: ; %bb.5: ; %bb18.preheader -; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_hi_u32 s0, s29, s28 -; GFX11-NEXT: s_mul_i32 s1, s29, s28 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_or_b32 s0, s0, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s0, s0, s30 -; GFX11-NEXT: s_mul_i32 s0, s0, s22 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s0, s0, s20 -; GFX11-NEXT: s_or_b32 s0, s19, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 -; GFX11-NEXT: s_mov_b32 s0, s1 -; GFX11-NEXT: global_load_u16 v1, v0, s[20:21] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, 
vcc_lo -; GFX11-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB2_6: ; %bb18 -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: v_readfirstlane_b32 s13, v0 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-NEXT: s_and_b32 s1, s8, s1 -; GFX11-NEXT: s_and_b32 s1, s1, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s1, s19, s13 -; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 -; GFX11-NEXT: s_and_b32 s1, s1, 1 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-NEXT: s_cselect_b32 s13, -1, 0 -; GFX11-NEXT: s_and_b32 s20, s9, exec_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 -; GFX11-NEXT: v_readfirstlane_b32 s13, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s13, s19, s13 -; GFX11-NEXT: s_bitcmp1_b32 s13, 0 -; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s13, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB2_6 -; GFX11-NEXT: ; %bb.7: ; %Flow -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB2_8: ; %Flow12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB2_12 -; GFX11-NEXT: ; %bb.9: -; GFX11-NEXT: s_xor_b32 s0, s8, -1 -; GFX11-NEXT: .LBB2_10: ; %bb17 -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB2_10 -; GFX11-NEXT: ; %bb.11: ; %Flow6 -; GFX11-NEXT: s_mov_b32 s18, -1 -; GFX11-NEXT: .LBB2_12: ; %Flow11 -; GFX11-NEXT: s_and_b32 s20, s2, exec_lo -; GFX11-NEXT: s_or_not1_b32 s0, s18, exec_lo -; GFX11-NEXT: .LBB2_13: ; %Flow9 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: s_and_saveexec_b32 s3, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_15 -; GFX11-NEXT: ; %bb.14: ; %bb43 -; GFX11-NEXT: s_add_u32 s8, s16, 0x58 -; GFX11-NEXT: s_addc_u32 s9, s17, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 -; GFX11-NEXT: s_mov_b32 s13, s14 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s14, s15 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_or_b32 s20, s20, exec_lo -; GFX11-NEXT: .LBB2_15: ; %Flow14 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: s_and_saveexec_b32 s0, s20 -; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock -; GFX11-NEXT: ; divergent unreachable -; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock -; GFX11-NEXT: s_endpgm + +; GFX11-TRUE16-LABEL: f2: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-TRUE16-NEXT: s_load_b32 s19, s[16:17], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s13 +; GFX11-TRUE16-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX11-TRUE16-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-TRUE16-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-TRUE16-NEXT: s_mov_b32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s32, 0 +; 
GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_lo_u32 v0, s19, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_13 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %bb14 +; GFX11-TRUE16-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c +; GFX11-TRUE16-NEXT: s_mov_b32 s18, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_bitcmp1_b32 s21, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s24, -1, 0 +; GFX11-TRUE16-NEXT: s_bitcmp0_b32 s21, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB2_3 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %bb15 +; GFX11-TRUE16-NEXT: s_add_u32 s8, s16, 0x58 +; GFX11-TRUE16-NEXT: s_addc_u32 s9, s17, 0 +; GFX11-TRUE16-NEXT: s_getpc_b64 s[0:1] +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s14 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s15 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-TRUE16-NEXT: s_branch .LBB2_12 +; GFX11-TRUE16-NEXT: .LBB2_3: +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB2_4: ; %bb16 +; GFX11-TRUE16-NEXT: s_load_b32 s1, s[16:17], 0x54 +; GFX11-TRUE16-NEXT: s_bitcmp1_b32 s23, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, -1 +; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s23, 1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_bitcmp1_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s9, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB2_8 +; GFX11-TRUE16-NEXT: ; %bb.5: ; %bb18.preheader +; GFX11-TRUE16-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mul_hi_u32 s8, s29, s28 +; GFX11-TRUE16-NEXT: s_mul_i32 s9, s29, s28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s8, s9, 1 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, 0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s8, s30 +; GFX11-TRUE16-NEXT: s_mul_i32 s8, s8, s22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mul_i32 s8, s8, s20 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b64 s[20:21], s[8:9], 1 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s9 +; GFX11-TRUE16-NEXT: global_load_u16 v1, v0, s[20:21] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB2_6: ; %bb18 +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s8, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 
s8, -1, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s1, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, exec_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s19, s13 +; GFX11-TRUE16-NEXT: s_and_b32 s13, 0xffff, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 1 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s20, s2, exec_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s19, s13 +; GFX11-TRUE16-NEXT: s_or_b32 s19, s9, 0x100 +; GFX11-TRUE16-NEXT: s_and_b32 s13, 1, s13 +; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s13, 1 +; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s19, s9 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_6 +; GFX11-TRUE16-NEXT: ; %bb.7: ; %Flow +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: .LBB2_8: ; %Flow12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_12 +; GFX11-TRUE16-NEXT: ; %bb.9: +; GFX11-TRUE16-NEXT: s_xor_b32 s1, s1, -1 +; GFX11-TRUE16-NEXT: .LBB2_10: ; %bb17 +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_10 +; GFX11-TRUE16-NEXT: ; %bb.11: ; %Flow6 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1 +; GFX11-TRUE16-NEXT: .LBB2_12: ; %Flow11 +; GFX11-TRUE16-NEXT: s_and_b32 s20, s0, exec_lo +; GFX11-TRUE16-NEXT: s_or_not1_b32 s0, s18, exec_lo +; GFX11-TRUE16-NEXT: .LBB2_13: ; %Flow9 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s3, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_15 +; GFX11-TRUE16-NEXT: ; %bb.14: ; %bb43 +; GFX11-TRUE16-NEXT: s_add_u32 s8, s16, 0x58 +; GFX11-TRUE16-NEXT: s_addc_u32 s9, s17, 0 +; GFX11-TRUE16-NEXT: s_getpc_b64 s[0:1] +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s14 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s15 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-TRUE16-NEXT: s_or_b32 s20, s20, exec_lo +; GFX11-TRUE16-NEXT: .LBB2_15: ; %Flow14 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s20 +; GFX11-TRUE16-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock +; GFX11-TRUE16-NEXT: ; divergent unreachable +; GFX11-TRUE16-NEXT: ; %bb.17: ; %UnifiedReturnBlock +; GFX11-TRUE16-NEXT: s_endpgm +; GFX11-FAKE16-LABEL: f2: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-FAKE16-NEXT: s_load_b32 s19, s[16:17], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s13 +; GFX11-FAKE16-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX11-FAKE16-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-FAKE16-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-FAKE16-NEXT: s_mov_b32 s20, 0 
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s32, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mul_lo_u32 v0, s19, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_13 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %bb14 +; GFX11-FAKE16-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c +; GFX11-FAKE16-NEXT: s_mov_b32 s18, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_bitcmp1_b32 s21, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s24, -1, 0 +; GFX11-FAKE16-NEXT: s_bitcmp0_b32 s21, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB2_3 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb15 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s16, 0x58 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s17, 0 +; GFX11-FAKE16-NEXT: s_getpc_b64 s[0:1] +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s14 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s14 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s15 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s21 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-FAKE16-NEXT: s_branch .LBB2_12 +; GFX11-FAKE16-NEXT: .LBB2_3: +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB2_12 +; GFX11-FAKE16-NEXT: .LBB2_4: ; %bb16 +; GFX11-FAKE16-NEXT: s_load_b32 s0, s[16:17], 0x54 +; GFX11-FAKE16-NEXT: s_bitcmp1_b32 s23, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s23, 1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_bitcmp1_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, -1, 0 +; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB2_8 +; GFX11-FAKE16-NEXT: ; %bb.5: ; %bb18.preheader +; GFX11-FAKE16-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mul_hi_u32 s0, s29, s28 +; GFX11-FAKE16-NEXT: s_mul_i32 s1, s29, s28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, s30 +; GFX11-FAKE16-NEXT: s_mul_i32 s0, s0, s22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_mul_i32 s0, s0, s20 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s19, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, s1 +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[20:21] +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB2_6: ; %bb18 +; 
GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s8, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, exec_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s19, s13 +; GFX11-FAKE16-NEXT: s_and_b32 s13, 0xffff, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 1 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s20, s9, exec_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s19, s13 +; GFX11-FAKE16-NEXT: s_or_b32 s19, s0, 0x100 +; GFX11-FAKE16-NEXT: s_and_b32 s13, 1, s13 +; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s13, 1 +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s19, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_6 +; GFX11-FAKE16-NEXT: ; %bb.7: ; %Flow +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB2_8: ; %Flow12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_12 +; GFX11-FAKE16-NEXT: ; %bb.9: +; GFX11-FAKE16-NEXT: s_xor_b32 s0, s8, -1 +; GFX11-FAKE16-NEXT: .LBB2_10: ; %bb17 +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_10 +; GFX11-FAKE16-NEXT: ; %bb.11: ; %Flow6 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1 +; GFX11-FAKE16-NEXT: .LBB2_12: ; %Flow11 +; GFX11-FAKE16-NEXT: s_and_b32 s20, s2, exec_lo +; GFX11-FAKE16-NEXT: s_or_not1_b32 s0, s18, exec_lo +; GFX11-FAKE16-NEXT: .LBB2_13: ; %Flow9 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s3, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_15 +; GFX11-FAKE16-NEXT: ; %bb.14: ; %bb43 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s16, 0x58 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s17, 0 +; GFX11-FAKE16-NEXT: s_getpc_b64 s[0:1] +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s14 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s15 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-FAKE16-NEXT: s_or_b32 s20, s20, exec_lo +; GFX11-FAKE16-NEXT: .LBB2_15: ; %Flow14 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s20 +; GFX11-FAKE16-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock +; GFX11-FAKE16-NEXT: ; divergent unreachable +; GFX11-FAKE16-NEXT: ; %bb.17: ; %UnifiedReturnBlock +; GFX11-FAKE16-NEXT: s_endpgm + bb: %i = tail call i32 @llvm.amdgcn.workitem.id.x() %i12 = mul i32 %arg, %i diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll index c3ce0d1aa739e..c13f1cdd23d36 
100644 --- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll @@ -340,15 +340,16 @@ define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_i64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v3 :: v_dual_cndmask_b32 v1, v4, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = xor i64 %a, u0x8000000000000000 %cmp = icmp eq i64 %cond, zeroinitializer @@ -361,15 +362,16 @@ define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_select_i64_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v5 :: v_dual_cndmask_b32 v1, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = xor i64 %a, u0x8000000000000000 %cmp = icmp eq i64 %cond, zeroinitializer @@ -382,16 +384,16 @@ define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, |v5|, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, |v5|, -v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_1_fabs_2_select_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e64 v0, |v5|, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, |v5|, -v3, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = xor i64 %a, u0x8000000000000000 %abs.b = and i64 %b, u0x7fffffffffffffff @@ -405,16 +407,16 @@ define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v5, |v3|, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_i64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, |v3|, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: 
v_cndmask_b32_e64 v1, v5, |v3|, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = and i64 %a, u0x7fffffffffffffff %cmp = icmp eq i64 %cond, zeroinitializer @@ -427,16 +429,16 @@ define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, |v3|, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fabs_select_i64_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e64 v0, |v3|, v5, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = and i64 %a, u0x7fffffffffffffff %cmp = icmp eq i64 %cond, zeroinitializer @@ -449,16 +451,16 @@ define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v5, -|v3|, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_fabs_select_i64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, -|v3|, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = or i64 %a, u0x8000000000000000 %cmp = icmp eq i64 %cond, zeroinitializer @@ -471,16 +473,16 @@ define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, -|v3|, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_fabs_select_i64_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e64 v0, -|v3|, v5, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.a = or i64 %a, u0x8000000000000000 %cmp = icmp eq i64 %cond, zeroinitializer diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll index 4e27cf20d3c98..c52f7a4ac720a 100644 --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -124,9 +124,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { ; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v0, v1 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; 
GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i32: @@ -136,9 +135,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v0, v1 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_i32: @@ -383,16 +381,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i32: @@ -402,16 +398,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v2i32: @@ -442,8 +436,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i64: @@ -456,8 +449,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_i64: @@ -470,8 +462,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 
v1, v5, -v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_i64: @@ -480,12 +471,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_saddsat_i64: @@ -494,11 +484,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll index 2c7819a395c86..2549e76821e1c 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll @@ -8,11 +8,10 @@ define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v2, 1, v2 +; CHECK-NEXT: v_or_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; CHECK-NEXT: v_bfe_i32 v0, v0, 0, 8 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %sext = sext i8 %x to i32 %select = select i1 %cond, i32 %sext, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 40d80f5e83e36..09c0e775f783d 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -124,9 +124,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v0, v1 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i32: @@ -136,9 +135,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v0, v1 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 
v0, v1, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i32: @@ -383,16 +381,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i32: @@ -402,16 +398,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v2i32: @@ -439,23 +433,20 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -v1, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v3, -v2, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v3i32: @@ -465,23 +456,20 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; 
GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, -v2, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v3i32: @@ -511,30 +499,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -v1, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, -v2, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v7 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, -v3, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v4i32: @@ -544,30 +528,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, 
0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, -v2, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, -v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v4i32: @@ -599,58 +579,50 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v8 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v8, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v8, -v1, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v10 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v8, -v2, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v8, -v3, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, -v4, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 
v5, v8, -v5, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, -v6, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, -v7, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v8i32: @@ -660,58 +632,50 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v0, v8 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v8, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, -v2, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, -v3, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, -v4, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, -v5, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX8-NEXT: 
v_ashrrev_i32_e32 v6, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, -v6, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, -v7, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v8i32: @@ -751,116 +715,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v16 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v16, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v1, v17 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v16, -v1, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v16, -v2, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v16, -v3, s[4:5] ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v4, v20 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v4, v17, -v4, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v17, -v5, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v6, 
v22 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v6, v17, -v6, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v23 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, v17, -v7, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v8, v24 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v8, v17, -v8, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v9, v25 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 ; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v9, v17, -v9, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v10, v26 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 ; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v10, v17, -v10, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v11, v27 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 ; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v11, v17, -v11, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v12, v28 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v12, v17, -v12, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v13, v29 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 ; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v13, v17, -v13, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v14, v30 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 -; GFX6-NEXT: s_xor_b64 
vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v14, v17, -v14, s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v16 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v15, v16, -v15, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v16i32: @@ -870,116 +818,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v0, v16 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v1, v17 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v16, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v16, -v2, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v16, -v3, s[4:5] ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v4, v20 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, -v4, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, -v5, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v6, v22 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; 
GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, -v6, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v7, v23 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, -v7, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v8, v24 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, -v8, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v9, v25 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, -v9, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v10, v26 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, -v10, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v11, v27 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, -v11, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v12, v28 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, -v12, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v13, v29 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 ; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, -v13, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v14, v30 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 
v14, v17, v14, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, -v14, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v16 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v15, v16, -v15, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v16i32: @@ -1066,8 +998,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i64: @@ -1080,8 +1011,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i64: @@ -1094,8 +1024,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_i64: @@ -1104,12 +1033,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_i64: @@ -1118,11 +1046,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result From 4c79cafe4a4831dbe501f3757bf05c687459ea66 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Sat, 12 Jul 2025 19:08:18 -0500 Subject: [PATCH 19/29] Remove dead code that was moved to the TI DAGCombiner --- 
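Note below the fold (ignored by "git am"): the helper deleted here duplicates logic that now lives in the target-independent (TI) DAGCombiner, so this copy is dead. The mapping it implemented rests on the IEEE-754 sign-bit identities for a 32-bit value: XOR with 0x80000000 is fneg, AND with 0x7fffffff is fabs, and OR with 0x80000000 is fneg(fabs). A minimal standalone check of those identities (plain C++20, no LLVM APIs; the sample value is arbitrary):

#include <bit>
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  const float F = -1.5f; // arbitrary sample value
  const std::uint32_t X = std::bit_cast<std::uint32_t>(F);
  // ISD::XOR with the sign mask flips the sign bit: fneg.
  assert(std::bit_cast<float>(X ^ 0x80000000u) == -F);
  // ISD::AND with the maximum signed value clears the sign bit: fabs.
  assert(std::bit_cast<float>(X & 0x7fffffffu) == std::fabs(F));
  // ISD::OR with the sign mask sets the sign bit: fneg(fabs).
  assert(std::bit_cast<float>(X | 0x80000000u) == -std::fabs(F));
  return 0;
}
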
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 66 ------------------- 1 file changed, 66 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 7436de2d6a6a8..b635f27c56979 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4842,54 +4842,6 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } -static EVT getFloatVT(EVT VT) { - EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits()); - return VT.isVector() ? VT.changeVectorElementType(FT) : FT; -} - -static SDValue getBitwiseToSrcModifierOp(SDValue N, - TargetLowering::DAGCombinerInfo &DCI) { - - unsigned Opc = N.getNode()->getOpcode(); - if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - ConstantSDNode *CRHS = isConstOrConstSplat(RHS); - - if (!CRHS) - return SDValue(); - - EVT VT = RHS.getValueType(); - EVT FVT = getFloatVT(VT); - SDLoc SL = SDLoc(N); - - switch (Opc) { - case ISD::XOR: - if (CRHS->getAPIntValue().isSignMask()) - return DAG.getNode(ISD::FNEG, SL, FVT, - DAG.getNode(ISD::BITCAST, SL, FVT, LHS)); - break; - case ISD::OR: - if (CRHS->getAPIntValue().isSignMask()) { - SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT, - DAG.getNode(ISD::BITCAST, SL, FVT, LHS)); - return DAG.getNode(ISD::FNEG, SL, FVT, Abs); - } - break; - case ISD::AND: - if (CRHS->getAPIntValue().isMaxSignedValue()) - return DAG.getNode(ISD::FABS, SL, FVT, - DAG.getNode(ISD::BITCAST, SL, FVT, LHS)); - break; - default: - return SDValue(); - } - return SDValue(); -} - SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) @@ -4930,24 +4882,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, // DCI.AddToWorklist(MinMax.getNode()); return MinMax; } - - auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue { - SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS, DCI); - SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS, DCI); - if (SrcModTrue || SrcModFalse) { - SDLoc SL(N); - EVT FVT = - SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType(); - SDValue FLHS = - SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS); - SDValue FRHS = - SrcModFalse ? SrcModFalse : DAG.getNode(ISD::BITCAST, SL, FVT, RHS); - SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS); - return DAG.getNode(ISD::BITCAST, SL, VT, FSelect); - } - return SDValue(); - }; - } // There's no reason to not do this if the condition has other uses. 
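
The following patch renames the fold's LHS/RHS parameters (and the derived SrcModTrue/SrcModFalse values) to the positional N1/N2 form used throughout DAGCombiner::visitSELECT, where a select's operands are referred to by index rather than by role:

  // Operand numbering for ISD::SELECT in DAGCombiner (illustrative sketch,
  // not part of either patch):
  // SDValue N0 = N->getOperand(0); // condition
  // SDValue N1 = N->getOperand(1); // value when N0 is true
  // SDValue N2 = N->getOperand(2); // value when N0 is false
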
From c265ed4a57c2cf8b27cb11c651c56dde1ea6475a Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Sat, 12 Jul 2025 19:22:59 -0500 Subject: [PATCH 20/29] Canonicalise TI select operand variable names and update tests --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 21 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 225 ++++++------------ 2 files changed, 83 insertions(+), 163 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4f58ffa47fd20..5243cebbdd05a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12440,18 +12440,17 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG)) return R; - auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue { - SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS); - SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS); - if (SrcModTrue || SrcModFalse) { + auto FoldSrcMods = [&](SDValue N1, SDValue N2, EVT VT) -> SDValue { + SDValue SrcModN1 = getBitwiseToSrcModifierOp(N1); + SDValue SrcModN2 = getBitwiseToSrcModifierOp(N2); + if (SrcModN1 || SrcModN2) { SDLoc SL(N); - EVT FVT = - SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType(); - SDValue FLHS = - SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS); - SDValue FRHS = - SrcModFalse ? SrcModFalse : DAG.getNode(ISD::BITCAST, SL, FVT, RHS); - SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FLHS, FRHS); + EVT FVT = SrcModN1 ? SrcModN1.getValueType() : SrcModN2.getValueType(); + SDValue FN1 = + SrcModN1 ? SrcModN1 : DAG.getNode(ISD::BITCAST, SL, FVT, N1); + SDValue FN2 = + SrcModN2 ? SrcModN2 : DAG.getNode(ISD::BITCAST, SL, FVT, N2); + SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FN1, FN2); return DAG.getNode(ISD::BITCAST, SL, VT, FSelect); } return SDValue(); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index d52fe845d62ec..606f6d1e3939b 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -275,23 +275,14 @@ define i16 @s_test_copysign_f16_10_mag(half inreg %sign) { ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: s_test_copysign_f16_10_mag: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0x8000 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, 0x4900 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: s_test_copysign_f16_10_mag: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0x8000 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, 0x4900 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: s_test_copysign_f16_10_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 0x8000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, 0x4900 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.copysign.f16(half 
10.0, half %sign) %cast = bitcast half %result to i16 ret i16 %cast @@ -1199,120 +1190,62 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0 -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 -; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8 -; GFX11-TRUE16-NEXT: s_sub_i32 s3, 0x3f1, s0 -; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffe -; GFX11-TRUE16-NEXT: v_med3_i32 v1, s3, 0, 13 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s3 -; GFX11-TRUE16-NEXT: s_or_b32 s3, s1, 0x1000 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-TRUE16-NEXT: s_or_b32 s6, s5, 1 -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s4, s3 -; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s6, s5 -; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0xfc10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s0, 12 -; GFX11-TRUE16-NEXT: s_or_b32 s4, s1, s4 -; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s0, 1 -; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 7 -; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-TRUE16-NEXT: s_cselect_b32 s5, -1, 0 -; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s4, 5 -; GFX11-TRUE16-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, exec_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, s4 -; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s0, 31 -; GFX11-TRUE16-NEXT: s_movk_i32 s4, 0x7e00 -; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s4, 0x7c00 -; GFX11-TRUE16-NEXT: s_cmpk_eq_i32 s0, 0x40f -; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s1, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-TRUE16-NEXT: ; return to shader part epilog -; -; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0 -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0 -; 
GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 -; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8 -; GFX11-FAKE16-NEXT: s_sub_i32 s3, 0x3f1, s0 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffe -; GFX11-FAKE16-NEXT: v_med3_i32 v1, s3, 0, 13 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s3 -; GFX11-FAKE16-NEXT: s_or_b32 s3, s1, 0x1000 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s3, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-FAKE16-NEXT: s_or_b32 s6, s5, 1 -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s4, s3 -; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s6, s5 -; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0xfc10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s0, 12 -; GFX11-FAKE16-NEXT: s_or_b32 s4, s1, s4 -; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s0, 1 -; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 7 -; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-FAKE16-NEXT: s_cselect_b32 s5, -1, 0 -; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s4, 5 -; GFX11-FAKE16-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, exec_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, s4 -; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s0, 31 -; GFX11-FAKE16-NEXT: s_movk_i32 s4, 0x7e00 -; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s4, 0x7c00 -; GFX11-FAKE16-NEXT: s_cmpk_eq_i32 s0, 0x40f -; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s1, s3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-FAKE16-NEXT: ; return to shader part epilog +; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s3, s1, 0x1ff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s3, s0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: s_bfe_u32 s0, s1, 0xb0014 +; GFX11-NEXT: s_lshr_b32 s1, s1, 8 +; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s0 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffe +; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 +; GFX11-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_or_b32 s3, s1, 0x1000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s5, s3, s4 +; GFX11-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-NEXT: s_or_b32 s6, s5, 1 +; GFX11-NEXT: s_cmp_lg_u32 s4, s3 +; GFX11-NEXT: s_cselect_b32 s3, s6, s5 +; GFX11-NEXT: s_addk_i32 s0, 0xfc10 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s4, s0, 12 +; GFX11-NEXT: s_or_b32 s4, s1, s4 +; GFX11-NEXT: s_cmp_lt_i32 s0, 1 +; GFX11-NEXT: s_cselect_b32 s3, s3, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s4, s3, 7 +; GFX11-NEXT: s_cmp_eq_u32 s4, 3 +; GFX11-NEXT: s_cselect_b32 s5, -1, 0 +; GFX11-NEXT: s_cmp_gt_i32 s4, 5 +; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s3, s3, s4 +; GFX11-NEXT: s_cmp_lt_i32 s0, 31 +; GFX11-NEXT: s_movk_i32 s4, 0x7e00 +; GFX11-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_cselect_b32 s1, s4, 0x7c00 +; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x40f +; GFX11-NEXT: s_cselect_b32 s0, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %mag.trunc = fptrunc double %mag to half %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign) %cast = bitcast half %result to i16 @@ -4418,27 +4351,15 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> inreg ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-TRUE16-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_mov_b32 s1, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, s4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-TRUE16-NEXT: ; return to shader part epilog -; -; GFX11-FAKE16-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s2, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-FAKE16-NEXT: ; return to shader part epilog +; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s2, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %sign.trunc = fptrunc <2 x double> %sign to <2 x half> %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag, 
<2 x half> %sign.trunc) %cast = bitcast <2 x half> %out to i32 From d658adbc1349f5c7df030ae205dbbed4b8c9593b Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Sat, 12 Jul 2025 19:28:04 -0500 Subject: [PATCH 21/29] Fix missed clang-format --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5243cebbdd05a..3e201deaaf3ff 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12189,7 +12189,8 @@ SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - if(!TLI.shouldFoldSelectWithIdentityConstant(N.getOpcode(), N->getValueType(0), ISD::SELECT, LHS, RHS)) + if (!TLI.shouldFoldSelectWithIdentityConstant( + N.getOpcode(), N->getValueType(0), ISD::SELECT, LHS, RHS)) return SDValue(); ConstantSDNode *CRHS = isConstOrConstSplat(RHS); From 9fb7344c197ef7bfbc81282de9d18ad87fe482cf Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Sat, 12 Jul 2025 19:32:45 -0500 Subject: [PATCH 22/29] Suppress overzealous clang-format --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index b635f27c56979..0ede2a9783461 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4876,10 +4876,10 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, } if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) { - SDValue MinMax = - combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + SDValue MinMax + = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); // Revisit this node so we can catch min3/max3/med3 patterns. - // DCI.AddToWorklist(MinMax.getNode()); + //DCI.AddToWorklist(MinMax.getNode()); return MinMax; } } From 29d9b3d2272c1ec37bb2cad00b62b1e21aae445f Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Sat, 12 Jul 2025 19:33:36 -0500 Subject: [PATCH 23/29] Suppress overzealous clang-format --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 0ede2a9783461..e64d2162441ab 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4877,7 +4877,7 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) { SDValue MinMax - = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); // Revisit this node so we can catch min3/max3/med3 patterns. //DCI.AddToWorklist(MinMax.getNode()); return MinMax; From cd5c7329bc074e71ff0fdad8f1ae25a99158620a Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Sun, 13 Jul 2025 04:03:42 -0500 Subject: [PATCH 24/29] Remove unnecessary lambda and refactor foldSelectOfSourceMods() to fit TI DAGCombiner style. 
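
The core rewrite is unchanged: an integer select whose arm is a sign-bit bitmask becomes a floating-point select whose arm carries an fneg/fabs, which the AMDGPU backend can later fold into a v_cndmask source modifier. The explicit VT == MVT::i32/v2i32 gate in visitSELECT is dropped; legality is now delegated to the target's shouldFoldSelectWithIdentityConstant() hook. A standalone sketch of the bit-level equivalence being relied on (plain C++20, no LLVM APIs; variable names and sample values are illustrative only):

#include <bit>
#include <cassert>
#include <cstdint>

// select(c, x ^ 0x80000000, y) on i32 bits must equal the bitcast of
// select(c, fneg(bitcast x), bitcast y) computed on f32 values.
int main() {
  const std::uint32_t X = std::bit_cast<std::uint32_t>(2.5f);
  const std::uint32_t Y = std::bit_cast<std::uint32_t>(-4.0f);
  for (int C = 0; C <= 1; ++C) {
    const std::uint32_t IntSel = C ? (X ^ 0x80000000u) : Y;
    const float FpSel = C ? -std::bit_cast<float>(X) : std::bit_cast<float>(Y);
    assert(IntSel == std::bit_cast<std::uint32_t>(FpSel));
  }
  return 0;
}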
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 53 +++++++++---------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3e201deaaf3ff..ba4767bdec1b4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -602,6 +602,7 @@ namespace { SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC); SDValue foldSelectOfBinops(SDNode *N); + SDValue foldSelectOfSourceMods(SDNode *N); SDValue foldSextSetcc(SDNode *N); SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, const SDLoc &DL); @@ -684,7 +685,6 @@ namespace { SDValue VecIn2, unsigned LeftIdx, bool DidSplitVec); SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast); - SDValue getBitwiseToSrcModifierOp(SDValue N); /// Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. void GatherAllAliases(SDNode *N, SDValue OriginalChain, @@ -12175,12 +12175,7 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True, return SDValue(); } -static EVT getFloatVT(EVT VT) { - EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits()); - return VT.isVector() ? VT.changeVectorElementType(FT) : FT; -} - -SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) { +static SDValue getBitwiseToSrcModifierOp(SDValue N, SelectionDAG &DAG) { unsigned Opc = N.getNode()->getOpcode(); if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR) @@ -12189,17 +12184,18 @@ SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.shouldFoldSelectWithIdentityConstant( N.getOpcode(), N->getValueType(0), ISD::SELECT, LHS, RHS)) return SDValue(); ConstantSDNode *CRHS = isConstOrConstSplat(RHS); - if (!CRHS) return SDValue(); EVT VT = RHS.getValueType(); - EVT FVT = getFloatVT(VT); + EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits()); + EVT FVT = VT.isVector() ? VT.changeVectorElementType(FT) : FT; SDLoc SL = SDLoc(N); switch (Opc) { @@ -12226,6 +12222,24 @@ SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) { return SDValue(); } +SDValue DAGCombiner::foldSelectOfSourceMods(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + EVT VT = N->getValueType(0); + SDValue SrcModN1 = getBitwiseToSrcModifierOp(N1, DAG); + SDValue SrcModN2 = getBitwiseToSrcModifierOp(N2, DAG); + if (SrcModN1 || SrcModN2) { + SDLoc SL(N); + EVT FVT = SrcModN1 ? SrcModN1.getValueType() : SrcModN2.getValueType(); + SDValue FN1 = SrcModN1 ? SrcModN1 : DAG.getNode(ISD::BITCAST, SL, FVT, N1); + SDValue FN2 = SrcModN2 ? 
SrcModN2 : DAG.getNode(ISD::BITCAST, SL, FVT, N2); + SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FN1, FN2); + return DAG.getNode(ISD::BITCAST, SL, VT, FSelect); + } + return SDValue(); +} + SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -12441,27 +12455,10 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG)) return R; - auto FoldSrcMods = [&](SDValue N1, SDValue N2, EVT VT) -> SDValue { - SDValue SrcModN1 = getBitwiseToSrcModifierOp(N1); - SDValue SrcModN2 = getBitwiseToSrcModifierOp(N2); - if (SrcModN1 || SrcModN2) { - SDLoc SL(N); - EVT FVT = SrcModN1 ? SrcModN1.getValueType() : SrcModN2.getValueType(); - SDValue FN1 = - SrcModN1 ? SrcModN1 : DAG.getNode(ISD::BITCAST, SL, FVT, N1); - SDValue FN2 = - SrcModN2 ? SrcModN2 : DAG.getNode(ISD::BITCAST, SL, FVT, N2); - SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FN1, FN2); - return DAG.getNode(ISD::BITCAST, SL, VT, FSelect); - } - return SDValue(); - }; - // Identify bitmask operations that are source mods and create // the relevant fneg, fabs or fneg+fabs. - if (VT == MVT::i32 || VT == MVT::v2i32) - if (SDValue F = FoldSrcMods(N1, N2, VT)) - return F; + if (SDValue F = foldSelectOfSourceMods(N)) + return F; return SDValue(); } From ec42e07d01cc84f4d4a435c2e190a25695eacdd4 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Sun, 13 Jul 2025 06:35:38 -0500 Subject: [PATCH 25/29] [NFC] Minor corrections to whitespace and test name --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 1 + llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ba4767bdec1b4..85585472881a6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -685,6 +685,7 @@ namespace { SDValue VecIn2, unsigned LeftIdx, bool DidSplitVec); SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast); + /// Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. 
void GatherAllAliases(SDNode *N, SDValue OriginalChain, diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll index c13f1cdd23d36..beab27ca97126 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll @@ -222,8 +222,8 @@ define <2 x i32> @fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> % ret <2 x i32> %select } -define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { -; GCN-LABEL: fneg_select_v2i32: +define <2 x i32> @fneg_1_fabs_2_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_1_fabs_2_select_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -232,7 +232,7 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) ; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fneg_select_v2i32: +; GFX11-LABEL: fneg_1_fabs_2_select_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 From 4bd51d01444f925de88274e185de3be53e0c76b8 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 14 Jul 2025 08:48:16 -0500 Subject: [PATCH 26/29] Add tighter constraints to apply combine and update tests. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 ++++++++ .../atomic_optimizations_global_pointer.ll | 18 ++++++++---------- .../branch-folding-implicit-def-subreg.ll | 18 +++++++++--------- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 7 +++---- .../AMDGPU/sdwa-peephole-cndmask-sext.ll | 7 ++++--- 5 files changed, 32 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 77632c1423f4e..5180c3805550d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15496,6 +15496,14 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, bool SITargetLowering::shouldFoldSelectWithIdentityConstant( unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const { + + if (BinOpcode != ISD::AND && BinOpcode != ISD::OR && BinOpcode != ISD::XOR) + return false; + + ConstantSDNode *CY = isConstOrConstSplat(Y); + if (!CY) + return false; + return (BinOpcode == ISD::AND || BinOpcode == ISD::OR || BinOpcode == ISD::XOR) && (VT.getScalarType() == MVT::i32); diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 7584d3eb12928..3ca7db155b385 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -7145,13 +7145,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: s_or_b32 s5, s4, s6 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 -; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0 
; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -8839,13 +8838,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: s_or_b32 s5, s4, s6 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 -; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0 ; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 50efed6da381b..92c63fead15ac 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -913,15 +913,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr3, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr53, 0, $vgpr10, 0, 0, 6, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr3, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr15, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr17, 0, $vgpr10, 0, 0, 6, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr15, killed $vcc, implicit $exec - ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr10, killed renamable $vgpr2, renamable $vgpr10, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec + ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) ; GFX90A-NEXT: S_BRANCH %bb.65 ; 
GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 40b33f48f4813..e687745469014 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -4318,9 +4318,9 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s1, 1, s1 +; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: s_xor_b32 s0, s0, 0x80008000 ; SI-NEXT: s_cmp_eq_u32 s1, 1 -; SI-NEXT: s_cselect_b32 s0, 0x80008000, s0 ; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -4358,10 +4358,9 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 1, v2 -; SI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; SI-NEXT: v_mov_b32_e32 v2, 0x80008000 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; SI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, vcc ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll index 2549e76821e1c..2c7819a395c86 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll @@ -8,10 +8,11 @@ define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v2, 1, v2 -; CHECK-NEXT: v_or_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CHECK-NEXT: v_bfe_i32 v0, v0, 0, 8 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %sext = sext i8 %x to i32 %select = select i1 %cond, i32 %sext, i32 0 From b0140bd97b08bfb326873a2a7bcee61951bb0bd5 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 14 Jul 2025 09:36:53 -0500 Subject: [PATCH 27/29] Further constrain shouldFoldSelectWithIdentityConstant(), preventing regressions --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 2012 +++++++++-------- llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 302 +-- .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 7 +- llvm/test/CodeGen/AMDGPU/fptrunc.ll | 133 +- .../AMDGPU/fptrunc.v2f16.no.fast.math.ll | 64 +- 6 files changed, 1294 insertions(+), 1235 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5180c3805550d..44f66a1de37fc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15504,9 +15504,14 @@ bool SITargetLowering::shouldFoldSelectWithIdentityConstant( if (!CY) return false; - return (BinOpcode == ISD::AND || BinOpcode == ISD::OR || - BinOpcode == ISD::XOR) && - (VT.getScalarType() == MVT::i32); + if (!CY->getAPIntValue().isSignMask() && + !CY->getAPIntValue().isMaxSignedValue()) + return false; + + if (VT.getScalarType() != MVT::i32) + return 
false; + + return true; } SDValue SITargetLowering::performSetCCCombine(SDNode *N, diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 606f6d1e3939b..ba4fe3685458d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -864,20 +864,21 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; SI-NEXT: v_med3_i32 v5, v5, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v6 -; SI-NEXT: v_or_b32_e32 v7, 1, v6 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: v_or_b32_e32 v5, v0, v5 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 ; SI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-NEXT: v_and_b32_e32 v5, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v5 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; SI-NEXT: v_mov_b32_e32 v5, 0x7c00 @@ -913,20 +914,21 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; VI-NEXT: v_med3_i32 v4, v4, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v3 ; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 -; VI-NEXT: v_or_b32_e32 v6, 1, v5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v3 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 ; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v1 +; VI-NEXT: v_or_b32_e32 v3, v5, v3 ; VI-NEXT: v_or_b32_e32 v4, v0, v4 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; VI-NEXT: v_and_b32_e32 v4, 7, v3 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 -; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v4, v4, v5 ; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -958,18 +960,19 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; GFX9-NEXT: v_med3_i32 v4, v4, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, v4, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, v4, v5 -; GFX9-NEXT: v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v1, 0xfffffc10, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX9-NEXT: v_lshl_or_b32 v4, v1, 12, v0 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v4, 7, v3 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; 
GFX9-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -999,35 +1002,36 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v3, v0 ; GFX11-NEXT: v_med3_i32 v3, v4, 0, 13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 -; GFX11-NEXT: v_lshl_or_b32 v7, v1, 12, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v3, v5 -; GFX11-NEXT: v_or_b32_e32 v6, 1, v5 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 12, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v4, 7, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 -; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 5, v4 -; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX11-NEXT: v_dual_mov_b32 v4, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v4, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %mag.trunc = fptrunc double %mag to half @@ -1053,31 +1057,29 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; SI-NEXT: s_or_b32 s3, s0, 0x1000 ; SI-NEXT: v_readfirstlane_b32 s5, v0 ; SI-NEXT: s_lshr_b32 s6, s3, s5 -; SI-NEXT: s_or_b32 s7, s6, 1 ; SI-NEXT: s_lshl_b32 s5, s6, s5 ; SI-NEXT: s_cmp_lg_u32 s5, s3 -; SI-NEXT: s_cselect_b32 s3, s7, s6 -; SI-NEXT: s_add_i32 s8, s4, 0xfffffc10 -; SI-NEXT: s_lshl_b32 s4, s8, 12 -; SI-NEXT: s_or_b32 s4, s0, s4 -; SI-NEXT: s_cmp_lt_i32 s8, 1 -; SI-NEXT: s_cselect_b32 s3, s3, s4 -; 
SI-NEXT: s_and_b32 s6, s3, 7 -; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_cmp_gt_i32 s6, 5 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_cselect_b32 s3, 1, 0 +; SI-NEXT: s_addk_i32 s4, 0xfc10 +; SI-NEXT: s_lshl_b32 s5, s4, 12 +; SI-NEXT: s_or_b32 s3, s6, s3 +; SI-NEXT: s_or_b32 s5, s0, s5 +; SI-NEXT: s_cmp_lt_i32 s4, 1 +; SI-NEXT: s_cselect_b32 s3, s3, s5 +; SI-NEXT: s_and_b32 s5, s3, 7 +; SI-NEXT: s_cmp_gt_i32 s5, 5 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_lshr_b32 s3, s3, 2 -; SI-NEXT: s_add_i32 s3, s3, s4 -; SI-NEXT: s_cmp_lt_i32 s8, 31 +; SI-NEXT: s_add_i32 s3, s3, s5 +; SI-NEXT: s_cmp_lt_i32 s4, 31 ; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_movk_i32 s0, 0x7e00 ; SI-NEXT: s_cselect_b32 s0, s0, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cmpk_eq_i32 s4, 0x40f ; SI-NEXT: s_cselect_b32 s0, s0, s3 ; SI-NEXT: s_lshr_b32 s1, s1, 16 ; SI-NEXT: s_and_b32 s1, s1, 0x8000 @@ -1102,37 +1104,35 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; VI-NEXT: s_and_b32 s0, s0, 0xffe ; VI-NEXT: v_readfirstlane_b32 s3, v0 ; VI-NEXT: s_sub_i32 s4, 0x3f1, s1 -; VI-NEXT: s_or_b32 s3, s0, s3 +; VI-NEXT: s_or_b32 s0, s0, s3 ; VI-NEXT: v_med3_i32 v0, s4, 0, 13 -; VI-NEXT: s_or_b32 s0, s3, 0x1000 +; VI-NEXT: s_or_b32 s3, s0, 0x1000 ; VI-NEXT: v_readfirstlane_b32 s4, v0 -; VI-NEXT: s_lshr_b32 s5, s0, s4 -; VI-NEXT: s_or_b32 s6, s5, 1 +; VI-NEXT: s_lshr_b32 s5, s3, s4 ; VI-NEXT: s_lshl_b32 s4, s5, s4 -; VI-NEXT: s_cmp_lg_u32 s4, s0 -; VI-NEXT: s_cselect_b32 s0, s6, s5 -; VI-NEXT: s_add_i32 s6, s1, 0xfffffc10 -; VI-NEXT: s_lshl_b32 s1, s6, 12 -; VI-NEXT: s_or_b32 s1, s3, s1 -; VI-NEXT: s_cmp_lt_i32 s6, 1 -; VI-NEXT: s_cselect_b32 s7, s0, s1 -; VI-NEXT: s_and_b32 s4, s7, 7 -; VI-NEXT: s_cmp_eq_u32 s4, 3 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_lg_u32 s4, s3 +; VI-NEXT: s_cselect_b32 s3, 1, 0 +; VI-NEXT: s_addk_i32 s1, 0xfc10 +; VI-NEXT: s_lshl_b32 s4, s1, 12 +; VI-NEXT: s_or_b32 s3, s5, s3 +; VI-NEXT: s_or_b32 s4, s0, s4 +; VI-NEXT: s_cmp_lt_i32 s1, 1 +; VI-NEXT: s_cselect_b32 s3, s3, s4 +; VI-NEXT: s_and_b32 s4, s3, 7 ; VI-NEXT: s_cmp_gt_i32 s4, 5 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; VI-NEXT: s_cselect_b32 s0, 1, 0 -; VI-NEXT: s_lshr_b32 s1, s7, 2 -; VI-NEXT: s_add_i32 s1, s1, s0 -; VI-NEXT: s_cmp_lt_i32 s6, 31 -; VI-NEXT: s_cselect_b32 s0, s1, 0x7c00 -; VI-NEXT: s_cmp_lg_u32 s3, 0 -; VI-NEXT: s_movk_i32 s1, 0x7e00 -; VI-NEXT: s_cselect_b32 s1, s1, 0x7c00 -; VI-NEXT: s_cmpk_eq_i32 s6, 0x40f -; VI-NEXT: s_cselect_b32 s0, s1, s0 +; VI-NEXT: s_cselect_b32 s5, 1, 0 +; VI-NEXT: s_cmp_eq_u32 s4, 3 +; VI-NEXT: s_cselect_b32 s4, 1, 0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshr_b32 s3, s3, 2 +; VI-NEXT: s_add_i32 s3, s3, s4 +; VI-NEXT: s_cmp_lt_i32 s1, 31 +; VI-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_movk_i32 s0, 0x7e00 +; VI-NEXT: s_cselect_b32 s0, s0, 0x7c00 +; VI-NEXT: s_cmpk_eq_i32 s1, 0x40f +; VI-NEXT: s_cselect_b32 s0, s0, s3 ; VI-NEXT: s_movk_i32 s1, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -1152,37 +1152,35 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double 
inreg %mag, hal ; GFX9-NEXT: s_and_b32 s0, s0, 0xffe ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 ; GFX9-NEXT: s_sub_i32 s4, 0x3f1, s1 -; GFX9-NEXT: s_or_b32 s3, s0, s3 +; GFX9-NEXT: s_or_b32 s0, s0, s3 ; GFX9-NEXT: v_med3_i32 v0, s4, 0, 13 -; GFX9-NEXT: s_or_b32 s0, s3, 0x1000 +; GFX9-NEXT: s_or_b32 s3, s0, 0x1000 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_lshr_b32 s5, s0, s4 -; GFX9-NEXT: s_or_b32 s6, s5, 1 +; GFX9-NEXT: s_lshr_b32 s5, s3, s4 ; GFX9-NEXT: s_lshl_b32 s4, s5, s4 -; GFX9-NEXT: s_cmp_lg_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s0, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s1, 0xfffffc10 -; GFX9-NEXT: s_lshl_b32 s1, s6, 12 -; GFX9-NEXT: s_or_b32 s1, s3, s1 -; GFX9-NEXT: s_cmp_lt_i32 s6, 1 -; GFX9-NEXT: s_cselect_b32 s7, s0, s1 -; GFX9-NEXT: s_and_b32 s4, s7, 7 -; GFX9-NEXT: s_cmp_eq_u32 s4, 3 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_addk_i32 s1, 0xfc10 +; GFX9-NEXT: s_lshl_b32 s4, s1, 12 +; GFX9-NEXT: s_or_b32 s3, s5, s3 +; GFX9-NEXT: s_or_b32 s4, s0, s4 +; GFX9-NEXT: s_cmp_lt_i32 s1, 1 +; GFX9-NEXT: s_cselect_b32 s3, s3, s4 +; GFX9-NEXT: s_and_b32 s4, s3, 7 ; GFX9-NEXT: s_cmp_gt_i32 s4, 5 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 -; GFX9-NEXT: s_lshr_b32 s1, s7, 2 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_cmp_lt_i32 s6, 31 -; GFX9-NEXT: s_cselect_b32 s0, s1, 0x7c00 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_movk_i32 s1, 0x7e00 -; GFX9-NEXT: s_cselect_b32 s1, s1, 0x7c00 -; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x40f -; GFX9-NEXT: s_cselect_b32 s0, s1, s0 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s4, 3 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_lshr_b32 s3, s3, 2 +; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: s_cmp_lt_i32 s1, 31 +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_movk_i32 s0, 0x7e00 +; GFX9-NEXT: s_cselect_b32 s0, s0, 0x7c00 +; GFX9-NEXT: s_cmpk_eq_i32 s1, 0x40f +; GFX9-NEXT: s_cselect_b32 s0, s0, s3 ; GFX9-NEXT: s_movk_i32 s1, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1213,26 +1211,23 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s5, s3, s4 ; GFX11-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-NEXT: s_or_b32 s6, s5, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s4, s3 -; GFX11-NEXT: s_cselect_b32 s3, s6, s5 +; GFX11-NEXT: s_cselect_b32 s3, 1, 0 ; GFX11-NEXT: s_addk_i32 s0, 0xfc10 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s3, s5, s3 ; GFX11-NEXT: s_lshl_b32 s4, s0, 12 ; GFX11-NEXT: s_or_b32 s4, s1, s4 ; GFX11-NEXT: s_cmp_lt_i32 s0, 1 ; GFX11-NEXT: s_cselect_b32 s3, s3, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s4, s3, 7 -; GFX11-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-NEXT: s_cselect_b32 s5, -1, 0 ; GFX11-NEXT: s_cmp_gt_i32 s4, 5 -; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11-NEXT: 
s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 3 ; GFX11-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-NEXT: s_or_b32 s4, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s3, s3, s4 ; GFX11-NEXT: s_cmp_lt_i32 s0, 31 @@ -3034,27 +3029,28 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v7, v3, 20, 11 -; SI-NEXT: s_movk_i32 s6, 0x3f1 +; SI-NEXT: s_movk_i32 s4, 0x3f1 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_sub_i32_e32 v8, vcc, s6, v7 +; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v7 ; SI-NEXT: v_or_b32_e32 v6, 0x1000, v2 ; SI-NEXT: v_med3_i32 v8, v8, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v9, v8, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v10, 1, v9 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v6 -; SI-NEXT: s_movk_i32 s7, 0xfc10 -; SI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: s_movk_i32 s5, 0xfc10 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 ; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v7 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 ; SI-NEXT: v_or_b32_e32 v8, v2, v8 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; SI-NEXT: v_and_b32_e32 v8, 7, v6 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v8 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; SI-NEXT: v_mov_b32_e32 v8, 0x7c00 @@ -3062,9 +3058,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; SI-NEXT: v_mov_b32_e32 v9, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_movk_i32 s8, 0x40f +; SI-NEXT: s_movk_i32 s6, 0x40f ; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v7 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 ; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v1 @@ -3077,24 +3073,25 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v6, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v7, vcc, s6, v6 +; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 ; SI-NEXT: v_med3_i32 v7, v7, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v10, v7, v3 ; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v10 -; SI-NEXT: v_or_b32_e32 v11, 1, v10 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v7, v0, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 ; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; SI-NEXT: v_and_b32_e32 v7, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: 
v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v7, v7, v10 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 @@ -3103,7 +3100,7 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; SI-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 @@ -3127,27 +3124,28 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; VI-NEXT: v_and_b32_e32 v5, 0xffe, v5 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v3, v3, 20, 11 -; VI-NEXT: s_movk_i32 s6, 0x3f1 +; VI-NEXT: s_movk_i32 s4, 0x3f1 ; VI-NEXT: v_or_b32_e32 v2, v5, v2 -; VI-NEXT: v_sub_u32_e32 v6, vcc, s6, v3 +; VI-NEXT: v_sub_u32_e32 v6, vcc, s4, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x1000, v2 ; VI-NEXT: v_med3_i32 v6, v6, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v7, v6, v5 ; VI-NEXT: v_lshlrev_b32_e32 v6, v6, v7 -; VI-NEXT: v_or_b32_e32 v8, 1, v7 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v5 -; VI-NEXT: s_movk_i32 s7, 0xfc10 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3 +; VI-NEXT: s_movk_i32 s5, 0xfc10 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, s5, v3 ; VI-NEXT: v_lshlrev_b32_e32 v6, 12, v3 +; VI-NEXT: v_or_b32_e32 v5, v7, v5 ; VI-NEXT: v_or_b32_e32 v6, v2, v6 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; VI-NEXT: v_and_b32_e32 v6, 7, v5 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v6 -; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v6, v6, v7 ; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v6 ; VI-NEXT: v_mov_b32_e32 v6, 0x7c00 @@ -3155,9 +3153,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; VI-NEXT: v_mov_b32_e32 v7, 0x7e00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_movk_i32 s8, 0x40f +; VI-NEXT: s_movk_i32 s6, 0x40f ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; VI-NEXT: v_and_b32_e32 v5, 0x1ff, v1 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 @@ -3167,31 +3165,32 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v1, v1, 20, 11 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: v_sub_u32_e32 v5, vcc, s6, v1 +; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v1 ; VI-NEXT: v_or_b32_e32 v3, 0x1000, v0 ; VI-NEXT: v_med3_i32 v5, v5, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v8, v5, v3 ; VI-NEXT: v_lshlrev_b32_e32 v5, v5, v8 -; VI-NEXT: v_or_b32_e32 v9, 1, v8 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, s7, v1 +; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1 ; VI-NEXT: v_lshlrev_b32_e32 v5, 12, v1 +; VI-NEXT: v_or_b32_e32 v3, v8, v3 ; 
VI-NEXT: v_or_b32_e32 v5, v0, v5 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 ; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; VI-NEXT: v_and_b32_e32 v5, 7, v3 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v5 -; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v5, v5, v8 ; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 ; VI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -3203,31 +3202,32 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s6, 0x1ff -; GFX9-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX9-NEXT: s_movk_i32 s4, 0x1ff +; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: s_movk_i32 s7, 0xffe +; GFX9-NEXT: s_movk_i32 s5, 0xffe ; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 11 -; GFX9-NEXT: v_and_or_b32 v0, v5, s7, v0 +; GFX9-NEXT: v_and_or_b32 v0, v5, s5, v0 ; GFX9-NEXT: v_sub_u32_e32 v7, 0x3f1, v6 ; GFX9-NEXT: v_or_b32_e32 v5, 0x1000, v0 ; GFX9-NEXT: v_med3_i32 v7, v7, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, v7, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_or_b32_e32 v9, 1, v8 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v6, 0xfffffc10, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX9-NEXT: v_lshl_or_b32 v7, v6, 12, v0 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7c00 @@ -3235,46 +3235,47 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7e00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_movk_i32 s8, 0x40f +; GFX9-NEXT: s_movk_i32 s6, 0x40f ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: s_mov_b32 s9, 0x8000 -; GFX9-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v2 +; GFX9-NEXT: s_mov_b32 s7, 0x8000 +; GFX9-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 20, 11 -; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, s5, v1 ; GFX9-NEXT: v_sub_u32_e32 v6, 0x3f1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, 0x1000, v1 ; GFX9-NEXT: v_med3_i32 v6, v6, 0, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, v6, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, v6, v9 -; GFX9-NEXT: v_or_b32_e32 v10, 1, v9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_or_b32_e32 v2, v9, v2 ; GFX9-NEXT: v_lshl_or_b32 v6, v5, 12, v1 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v6 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_or_b32_e32 v6, v6, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_and_or_b32 v1, v2, s9, v1 +; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff @@ -3288,11 +3289,12 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 20, 11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 20, 11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v9, 0x3f1, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v5, v2 @@ -3309,59 +3311,61 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, v5, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, v8, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 1, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 1, v12 -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v11, v13 :: v_dual_add_nc_u32 v7, 0xfffffc10, v7 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, 0x7e00 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v5, v12 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v7, 12, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v6, 12, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v10 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v12, v15 :: v_dual_add_nc_u32 v6, 0xfffffc10, v6 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v6, 12, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0xfffffc10, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v7, 12, v0 ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v10, v5 :: v_dual_mov_b32 v10, 0x7e00 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 7, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 7, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5 +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 -; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v8, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v10, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 7, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s2, 5, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; 
GFX11-TRUE16-NEXT: s_or_b32 s0, s2, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v8, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v11 :: v_dual_add_nc_u32 v5, v5, v12 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v10 :: v_dual_add_nc_u32 v5, v5, v11 ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v9, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v12, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x8000, v9, v0 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x8000, v12, v0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 @@ -3374,15 +3378,17 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 20, 11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 20, 11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, 0x7e00 ; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v9, 0x3f1, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v5, v0 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v7 @@ -3395,59 +3401,62 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, v8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, v5, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, v8, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 1, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 1, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) 
| instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v9 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v11, v14 :: v_dual_add_nc_u32 v7, 0xfffffc10, v7 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, 0x7e00 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v5, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v7, 12, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v6, 12, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v10 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v12, v15 :: v_dual_add_nc_u32 v6, 0xfffffc10, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v6, 12, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v7 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0xfffffc10, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v7, 12, v2 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 7, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 7, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5 -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s2, 5, v10 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v11, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v13, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 
0, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v13, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, 0x7c00, v8 :: v_dual_add_nc_u32 v5, v5, v10 ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %mag.trunc = fptrunc <2 x double> %mag to <2 x half> @@ -3844,82 +3853,78 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_or_b32 s4, s0, 0x1000 ; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: s_lshr_b32 s7, s4, s6 -; SI-NEXT: s_or_b32 s8, s7, 1 ; SI-NEXT: s_lshl_b32 s6, s7, s6 ; SI-NEXT: s_cmp_lg_u32 s6, s4 -; SI-NEXT: s_cselect_b32 s4, s8, s7 -; SI-NEXT: s_add_i32 s8, s5, 0xfffffc10 -; SI-NEXT: s_lshl_b32 s5, s8, 12 -; SI-NEXT: s_or_b32 s5, s0, s5 -; SI-NEXT: s_cmp_lt_i32 s8, 1 -; SI-NEXT: s_cselect_b32 s9, s4, s5 -; SI-NEXT: s_and_b32 s6, s9, 7 -; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_cmp_gt_i32 s6, 5 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; SI-NEXT: s_cselect_b32 s4, 1, 0 -; SI-NEXT: s_lshr_b32 s5, s9, 2 -; SI-NEXT: s_add_i32 s5, s5, s4 -; SI-NEXT: s_cmp_lt_i32 s8, 31 -; SI-NEXT: s_cselect_b32 s4, s5, 0x7c00 +; SI-NEXT: s_addk_i32 s5, 0xfc10 +; SI-NEXT: s_lshl_b32 s6, s5, 12 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: s_or_b32 s6, s0, s6 +; SI-NEXT: s_cmp_lt_i32 s5, 1 +; SI-NEXT: s_cselect_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s4, 7 +; SI-NEXT: s_cmp_gt_i32 s6, 5 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s6, 3 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_lshr_b32 s4, s4, 2 +; SI-NEXT: s_add_i32 s4, s4, s6 +; SI-NEXT: s_cmp_lt_i32 s5, 31 +; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_movk_i32 s6, 0x7e00 ; SI-NEXT: s_cselect_b32 s0, s6, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-NEXT: s_cmpk_eq_i32 s5, 0x40f ; SI-NEXT: s_cselect_b32 s0, s0, s4 ; SI-NEXT: s_lshr_b32 s1, s1, 16 ; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s7, s1, s0 +; SI-NEXT: s_or_b32 s4, s1, s0 ; SI-NEXT: s_lshr_b32 s0, s3, 8 -; SI-NEXT: s_and_b32 s4, s0, 0xffe +; 
SI-NEXT: s_and_b32 s5, s0, 0xffe ; SI-NEXT: s_and_b32 s0, s3, 0x1ff ; SI-NEXT: s_or_b32 s0, s0, s2 ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SI-NEXT: v_readfirstlane_b32 s0, v2 -; SI-NEXT: s_bfe_u32 s1, s3, 0xb0014 -; SI-NEXT: s_or_b32 s2, s4, s0 -; SI-NEXT: s_sub_i32 s4, 0x3f1, s1 -; SI-NEXT: v_med3_i32 v2, s4, 0, 13 -; SI-NEXT: s_or_b32 s0, s2, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_lshr_b32 s5, s0, s4 -; SI-NEXT: s_or_b32 s8, s5, 1 -; SI-NEXT: s_lshl_b32 s4, s5, s4 -; SI-NEXT: s_cmp_lg_u32 s4, s0 -; SI-NEXT: s_cselect_b32 s0, s8, s5 -; SI-NEXT: s_add_i32 s8, s1, 0xfffffc10 -; SI-NEXT: s_lshl_b32 s1, s8, 12 -; SI-NEXT: s_or_b32 s1, s2, s1 -; SI-NEXT: s_cmp_lt_i32 s8, 1 -; SI-NEXT: s_cselect_b32 s9, s0, s1 -; SI-NEXT: s_and_b32 s4, s9, 7 -; SI-NEXT: s_cmp_eq_u32 s4, 3 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_cmp_gt_i32 s4, 5 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s0, 1, 0 -; SI-NEXT: s_lshr_b32 s1, s9, 2 -; SI-NEXT: s_add_i32 s1, s1, s0 -; SI-NEXT: s_cmp_lt_i32 s8, 31 -; SI-NEXT: s_cselect_b32 s0, s1, 0x7c00 -; SI-NEXT: s_cmp_lg_u32 s2, 0 -; SI-NEXT: s_cselect_b32 s1, s6, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f -; SI-NEXT: s_cselect_b32 s0, s1, s0 +; SI-NEXT: s_bfe_u32 s2, s3, 0xb0014 +; SI-NEXT: s_or_b32 s0, s5, s0 +; SI-NEXT: s_sub_i32 s5, 0x3f1, s2 +; SI-NEXT: v_med3_i32 v2, s5, 0, 13 +; SI-NEXT: s_or_b32 s1, s0, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: s_lshr_b32 s7, s1, s5 +; SI-NEXT: s_lshl_b32 s5, s7, s5 +; SI-NEXT: s_cmp_lg_u32 s5, s1 +; SI-NEXT: s_cselect_b32 s1, 1, 0 +; SI-NEXT: s_addk_i32 s2, 0xfc10 +; SI-NEXT: s_lshl_b32 s5, s2, 12 +; SI-NEXT: s_or_b32 s1, s7, s1 +; SI-NEXT: s_or_b32 s5, s0, s5 +; SI-NEXT: s_cmp_lt_i32 s2, 1 +; SI-NEXT: s_cselect_b32 s1, s1, s5 +; SI-NEXT: s_and_b32 s5, s1, 7 +; SI-NEXT: s_cmp_gt_i32 s5, 5 +; SI-NEXT: s_cselect_b32 s7, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: s_cselect_b32 s5, 1, 0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_lshr_b32 s1, s1, 2 +; SI-NEXT: s_add_i32 s1, s1, s5 +; SI-NEXT: s_cmp_lt_i32 s2, 31 +; SI-NEXT: s_cselect_b32 s1, s1, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cselect_b32 s0, s6, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s2, 0x40f +; SI-NEXT: s_cselect_b32 s0, s0, s1 ; SI-NEXT: s_lshr_b32 s1, s3, 16 ; SI-NEXT: s_and_b32 s1, s1, 0x8000 ; SI-NEXT: s_or_b32 s0, s1, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v0, s0, v2, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -3942,38 +3947,36 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014 ; VI-NEXT: v_readfirstlane_b32 s2, v0 ; VI-NEXT: s_sub_i32 s6, 0x3f1, s3 -; VI-NEXT: s_or_b32 s5, s5, s2 +; VI-NEXT: s_or_b32 s2, s5, s2 ; VI-NEXT: v_med3_i32 v0, s6, 0, 13 -; VI-NEXT: s_or_b32 s2, s5, 0x1000 +; VI-NEXT: s_or_b32 s5, s2, 0x1000 ; VI-NEXT: v_readfirstlane_b32 s6, v0 -; VI-NEXT: s_lshr_b32 s7, s2, s6 -; VI-NEXT: s_or_b32 s8, s7, 1 +; VI-NEXT: s_lshr_b32 s7, s5, s6 ; VI-NEXT: s_lshl_b32 s6, s7, s6 -; VI-NEXT: s_cmp_lg_u32 s6, s2 -; VI-NEXT: s_cselect_b32 s2, s8, s7 -; VI-NEXT: s_add_i32 s8, s3, 0xfffffc10 -; VI-NEXT: s_lshl_b32 s3, s8, 12 -; VI-NEXT: s_or_b32 s3, s5, s3 -; VI-NEXT: 
s_cmp_lt_i32 s8, 1 -; VI-NEXT: s_cselect_b32 s9, s2, s3 -; VI-NEXT: s_and_b32 s6, s9, 7 -; VI-NEXT: s_cmp_eq_u32 s6, 3 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_lg_u32 s6, s5 +; VI-NEXT: s_cselect_b32 s5, 1, 0 +; VI-NEXT: s_addk_i32 s3, 0xfc10 +; VI-NEXT: s_lshl_b32 s6, s3, 12 +; VI-NEXT: s_or_b32 s5, s7, s5 +; VI-NEXT: s_or_b32 s6, s2, s6 +; VI-NEXT: s_cmp_lt_i32 s3, 1 +; VI-NEXT: s_cselect_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s5, 7 ; VI-NEXT: s_cmp_gt_i32 s6, 5 -; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; VI-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] -; VI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; VI-NEXT: s_cselect_b32 s2, 1, 0 -; VI-NEXT: s_lshr_b32 s3, s9, 2 -; VI-NEXT: s_add_i32 s3, s3, s2 -; VI-NEXT: s_cmp_lt_i32 s8, 31 -; VI-NEXT: s_cselect_b32 s2, s3, 0x7c00 -; VI-NEXT: s_cmp_lg_u32 s5, 0 -; VI-NEXT: s_movk_i32 s5, 0x7e00 -; VI-NEXT: s_cselect_b32 s3, s5, 0x7c00 -; VI-NEXT: s_cmpk_eq_i32 s8, 0x40f -; VI-NEXT: s_cselect_b32 s2, s3, s2 -; VI-NEXT: s_lshl_b32 s6, s2, 16 +; VI-NEXT: s_cselect_b32 s7, 1, 0 +; VI-NEXT: s_cmp_eq_u32 s6, 3 +; VI-NEXT: s_cselect_b32 s6, 1, 0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_lshr_b32 s5, s5, 2 +; VI-NEXT: s_add_i32 s5, s5, s6 +; VI-NEXT: s_cmp_lt_i32 s3, 31 +; VI-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_movk_i32 s6, 0x7e00 +; VI-NEXT: s_cselect_b32 s2, s6, 0x7c00 +; VI-NEXT: s_cmpk_eq_i32 s3, 0x40f +; VI-NEXT: s_cselect_b32 s2, s2, s5 +; VI-NEXT: s_lshl_b32 s5, s2, 16 ; VI-NEXT: s_lshr_b32 s2, s1, 8 ; VI-NEXT: s_and_b32 s7, s2, 0xffe ; VI-NEXT: s_and_b32 s2, s1, 0x1ff @@ -3983,39 +3986,37 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014 ; VI-NEXT: v_readfirstlane_b32 s0, v0 -; VI-NEXT: s_sub_i32 s2, 0x3f1, s1 -; VI-NEXT: s_or_b32 s7, s7, s0 -; VI-NEXT: v_med3_i32 v0, s2, 0, 13 -; VI-NEXT: s_or_b32 s0, s7, 0x1000 -; VI-NEXT: v_readfirstlane_b32 s2, v0 -; VI-NEXT: s_lshr_b32 s3, s0, s2 -; VI-NEXT: s_or_b32 s8, s3, 1 -; VI-NEXT: s_lshl_b32 s2, s3, s2 -; VI-NEXT: s_cmp_lg_u32 s2, s0 -; VI-NEXT: s_cselect_b32 s0, s8, s3 -; VI-NEXT: s_add_i32 s8, s1, 0xfffffc10 -; VI-NEXT: s_lshl_b32 s1, s8, 12 -; VI-NEXT: s_or_b32 s1, s7, s1 -; VI-NEXT: s_cmp_lt_i32 s8, 1 -; VI-NEXT: s_cselect_b32 s9, s0, s1 -; VI-NEXT: s_and_b32 s2, s9, 7 -; VI-NEXT: s_cmp_eq_u32 s2, 3 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_gt_i32 s2, 5 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; VI-NEXT: s_cselect_b32 s0, 1, 0 -; VI-NEXT: s_lshr_b32 s1, s9, 2 -; VI-NEXT: s_add_i32 s1, s1, s0 -; VI-NEXT: s_cmp_lt_i32 s8, 31 -; VI-NEXT: s_cselect_b32 s0, s1, 0x7c00 -; VI-NEXT: s_cmp_lg_u32 s7, 0 -; VI-NEXT: s_cselect_b32 s1, s5, 0x7c00 -; VI-NEXT: s_cmpk_eq_i32 s8, 0x40f -; VI-NEXT: s_cselect_b32 s0, s1, s0 +; VI-NEXT: s_sub_i32 s3, 0x3f1, s1 +; VI-NEXT: s_or_b32 s0, s7, s0 +; VI-NEXT: v_med3_i32 v0, s3, 0, 13 +; VI-NEXT: s_or_b32 s2, s0, 0x1000 +; VI-NEXT: v_readfirstlane_b32 s3, v0 +; VI-NEXT: s_lshr_b32 s7, s2, s3 +; VI-NEXT: s_lshl_b32 s3, s7, s3 +; VI-NEXT: s_cmp_lg_u32 s3, s2 +; VI-NEXT: s_cselect_b32 s2, 1, 0 +; VI-NEXT: s_addk_i32 s1, 0xfc10 +; VI-NEXT: s_lshl_b32 s3, s1, 12 +; VI-NEXT: s_or_b32 s2, s7, s2 +; VI-NEXT: s_or_b32 s3, s0, s3 +; VI-NEXT: s_cmp_lt_i32 s1, 1 +; VI-NEXT: s_cselect_b32 s2, s2, s3 +; VI-NEXT: s_and_b32 s3, s2, 7 +; VI-NEXT: s_cmp_gt_i32 s3, 5 +; VI-NEXT: s_cselect_b32 s7, 1, 0 +; VI-NEXT: 
s_cmp_eq_u32 s3, 3 +; VI-NEXT: s_cselect_b32 s3, 1, 0 +; VI-NEXT: s_or_b32 s3, s3, s7 +; VI-NEXT: s_lshr_b32 s2, s2, 2 +; VI-NEXT: s_add_i32 s2, s2, s3 +; VI-NEXT: s_cmp_lt_i32 s1, 31 +; VI-NEXT: s_cselect_b32 s2, s2, 0x7c00 +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cselect_b32 s0, s6, 0x7c00 +; VI-NEXT: s_cmpk_eq_i32 s1, 0x40f +; VI-NEXT: s_cselect_b32 s0, s0, s2 ; VI-NEXT: s_and_b32 s0, s0, 0x7fff -; VI-NEXT: s_or_b32 s0, s0, s6 +; VI-NEXT: s_or_b32 s0, s0, s5 ; VI-NEXT: s_mov_b32 s1, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -4040,31 +4041,29 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-NEXT: s_or_b32 s5, s2, 0x1000 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_lshr_b32 s8, s5, s7 -; GFX9-NEXT: s_or_b32 s9, s8, 1 ; GFX9-NEXT: s_lshl_b32 s7, s8, s7 ; GFX9-NEXT: s_cmp_lg_u32 s7, s5 -; GFX9-NEXT: s_cselect_b32 s5, s9, s8 -; GFX9-NEXT: s_add_i32 s10, s6, 0xfffffc10 -; GFX9-NEXT: s_lshl_b32 s6, s10, 12 -; GFX9-NEXT: s_or_b32 s6, s2, s6 -; GFX9-NEXT: s_cmp_lt_i32 s10, 1 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s8, s5, 7 -; GFX9-NEXT: s_cmp_eq_u32 s8, 3 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_cmp_gt_i32 s8, 5 -; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_addk_i32 s6, 0xfc10 +; GFX9-NEXT: s_lshl_b32 s7, s6, 12 +; GFX9-NEXT: s_or_b32 s5, s8, s5 +; GFX9-NEXT: s_or_b32 s7, s2, s7 +; GFX9-NEXT: s_cmp_lt_i32 s6, 1 +; GFX9-NEXT: s_cselect_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s5, 7 +; GFX9-NEXT: s_cmp_gt_i32 s7, 5 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 3 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshr_b32 s5, s5, 2 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_cmp_lt_i32 s10, 31 +; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: s_cmp_lt_i32 s6, 31 ; GFX9-NEXT: s_cselect_b32 s5, s5, 0x7c00 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_movk_i32 s8, 0x7e00 -; GFX9-NEXT: s_cselect_b32 s2, s8, 0x7c00 -; GFX9-NEXT: s_cmpk_eq_i32 s10, 0x40f +; GFX9-NEXT: s_movk_i32 s7, 0x7e00 +; GFX9-NEXT: s_cselect_b32 s2, s7, 0x7c00 +; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x40f ; GFX9-NEXT: s_cselect_b32 s2, s2, s5 ; GFX9-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NEXT: s_and_b32 s3, s3, 0x8000 @@ -4083,31 +4082,29 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-NEXT: v_med3_i32 v0, s6, 0, 13 ; GFX9-NEXT: s_or_b32 s2, s0, 0x1000 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_lshr_b32 s7, s2, s6 -; GFX9-NEXT: s_or_b32 s9, s7, 1 -; GFX9-NEXT: s_lshl_b32 s6, s7, s6 +; GFX9-NEXT: s_lshr_b32 s8, s2, s6 +; GFX9-NEXT: s_lshl_b32 s6, s8, s6 ; GFX9-NEXT: s_cmp_lg_u32 s6, s2 -; GFX9-NEXT: s_cselect_b32 s2, s9, s7 -; GFX9-NEXT: s_add_i32 s9, s3, 0xfffffc10 -; GFX9-NEXT: s_lshl_b32 s3, s9, 12 -; GFX9-NEXT: s_or_b32 s3, s0, s3 -; GFX9-NEXT: s_cmp_lt_i32 s9, 1 -; GFX9-NEXT: s_cselect_b32 s10, s2, s3 -; GFX9-NEXT: s_and_b32 s6, s10, 7 -; GFX9-NEXT: s_cmp_eq_u32 s6, 3 -; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_cmp_gt_i32 s6, 5 -; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX9-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_lshr_b32 s3, s10, 2 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_cmp_lt_i32 s9, 31 -; 
GFX9-NEXT: s_cselect_b32 s2, s3, 0x7c00 +; GFX9-NEXT: s_addk_i32 s3, 0xfc10 +; GFX9-NEXT: s_lshl_b32 s6, s3, 12 +; GFX9-NEXT: s_or_b32 s2, s8, s2 +; GFX9-NEXT: s_or_b32 s6, s0, s6 +; GFX9-NEXT: s_cmp_lt_i32 s3, 1 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s6, s2, 7 +; GFX9-NEXT: s_cmp_gt_i32 s6, 5 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 3 +; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_lshr_b32 s2, s2, 2 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_cmp_lt_i32 s3, 31 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7c00 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s0, s8, 0x7c00 -; GFX9-NEXT: s_cmpk_eq_i32 s9, 0x40f +; GFX9-NEXT: s_cselect_b32 s0, s7, 0x7c00 +; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x40f ; GFX9-NEXT: s_cselect_b32 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: s_and_b32 s1, s1, 0x8000 @@ -4142,26 +4139,23 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s8, s6, s7 ; GFX11-NEXT: s_lshl_b32 s7, s8, s7 -; GFX11-NEXT: s_or_b32 s9, s8, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s7, s6 -; GFX11-NEXT: s_cselect_b32 s6, s9, s8 +; GFX11-NEXT: s_cselect_b32 s6, 1, 0 ; GFX11-NEXT: s_addk_i32 s2, 0xfc10 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s6, s8, s6 ; GFX11-NEXT: s_lshl_b32 s7, s2, 12 ; GFX11-NEXT: s_or_b32 s7, s5, s7 ; GFX11-NEXT: s_cmp_lt_i32 s2, 1 ; GFX11-NEXT: s_cselect_b32 s6, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s7, s6, 7 -; GFX11-NEXT: s_cmp_eq_u32 s7, 3 -; GFX11-NEXT: s_cselect_b32 s8, -1, 0 ; GFX11-NEXT: s_cmp_gt_i32 s7, 5 -; GFX11-NEXT: s_cselect_b32 s7, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s7, s7, s8 -; GFX11-NEXT: s_and_b32 s7, s7, exec_lo +; GFX11-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s7, 3 ; GFX11-NEXT: s_cselect_b32 s7, 1, 0 ; GFX11-NEXT: s_lshr_b32 s6, s6, 2 +; GFX11-NEXT: s_or_b32 s7, s7, s8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s6, s6, s7 ; GFX11-NEXT: s_cmp_lt_i32 s2, 31 @@ -4195,26 +4189,23 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s8, s5, s6 ; GFX11-NEXT: s_lshl_b32 s6, s8, s6 -; GFX11-NEXT: s_or_b32 s9, s8, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s6, s5 -; GFX11-NEXT: s_cselect_b32 s5, s9, s8 +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-NEXT: s_addk_i32 s0, 0xfc10 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s8, s5 ; GFX11-NEXT: s_lshl_b32 s6, s0, 12 ; GFX11-NEXT: s_or_b32 s6, s3, s6 ; GFX11-NEXT: s_cmp_lt_i32 s0, 1 ; GFX11-NEXT: s_cselect_b32 s5, s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s6, s5, 7 -; GFX11-NEXT: s_cmp_eq_u32 s6, 3 -; GFX11-NEXT: s_cselect_b32 s8, -1, 0 ; GFX11-NEXT: s_cmp_gt_i32 s6, 5 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s6, s6, s8 -; GFX11-NEXT: s_and_b32 s6, s6, exec_lo +; GFX11-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 3 ; GFX11-NEXT: s_cselect_b32 s6, 1, 0 ; GFX11-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-NEXT: s_or_b32 s6, s6, s8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s5, s5, s6 ; GFX11-NEXT: s_cmp_lt_i32 s0, 31 @@ -4674,27 +4665,28 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-NEXT: v_and_b32_e32 v9, 0xffe, v9 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v10, v5, 20, 11 -; SI-NEXT: s_movk_i32 s6, 0x3f1 +; SI-NEXT: s_movk_i32 s4, 0x3f1 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_sub_i32_e32 v11, vcc, s6, v10 +; SI-NEXT: v_sub_i32_e32 v11, vcc, s4, v10 ; SI-NEXT: v_or_b32_e32 v9, 0x1000, v4 ; SI-NEXT: v_med3_i32 v11, v11, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v12, v11, v9 ; SI-NEXT: v_lshlrev_b32_e32 v11, v11, v12 -; SI-NEXT: v_or_b32_e32 v13, 1, v12 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v9 -; SI-NEXT: s_movk_i32 s7, 0xfc10 -; SI-NEXT: v_cndmask_b32_e32 v9, v12, v13, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: s_movk_i32 s5, 0xfc10 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, s5, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 12, v10 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 ; SI-NEXT: v_or_b32_e32 v11, v4, v11 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v10 ; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc ; SI-NEXT: v_and_b32_e32 v11, 7, v9 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11 +; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v11 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_lshrrev_b32_e32 v9, 2, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; SI-NEXT: v_mov_b32_e32 v11, 0x7c00 @@ -4702,9 +4694,9 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc ; SI-NEXT: v_mov_b32_e32 v12, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: s_movk_i32 s8, 0x40f +; SI-NEXT: s_movk_i32 s6, 0x40f ; SI-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v10 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v10 ; SI-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_and_b32_e32 v9, 0x1ff, v3 @@ -4717,31 +4709,32 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v9, v3, 20, 11 ; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_sub_i32_e32 v10, vcc, s6, v9 +; SI-NEXT: v_sub_i32_e32 v10, vcc, s4, v9 ; SI-NEXT: v_or_b32_e32 v5, 0x1000, v2 ; SI-NEXT: v_med3_i32 v10, v10, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v13, v10, v5 ; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v13 -; SI-NEXT: v_or_b32_e32 v14, 1, v13 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v5 -; SI-NEXT: v_cndmask_b32_e32 v5, v13, v14, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, s5, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v9 +; SI-NEXT: v_or_b32_e32 v5, v13, v5 ; SI-NEXT: v_or_b32_e32 v10, v2, v10 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v9 ; SI-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc ; SI-NEXT: v_and_b32_e32 v10, 7, v5 +; SI-NEXT: 
v_cmp_lt_i32_e32 vcc, 5, v10 +; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v10 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v10, v10, v13 ; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v9 ; SI-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v9 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v9 ; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v5, 0x1ff, v1 @@ -4754,24 +4747,25 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_bfe_u32 v5, v1, 20, 11 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v9, vcc, s6, v5 +; SI-NEXT: v_sub_i32_e32 v9, vcc, s4, v5 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 ; SI-NEXT: v_med3_i32 v9, v9, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v10, v9, v3 ; SI-NEXT: v_lshlrev_b32_e32 v9, v9, v10 -; SI-NEXT: v_or_b32_e32 v13, 1, v10 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v10, v13, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 12, v5 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v9, v0, v9 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; SI-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; SI-NEXT: v_and_b32_e32 v9, 7, v3 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v9 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 @@ -4779,7 +4773,7 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 @@ -4804,27 +4798,28 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; VI-NEXT: v_and_b32_e32 v8, 0xffe, v8 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v5, v5, 20, 11 -; VI-NEXT: s_movk_i32 s6, 0x3f1 +; VI-NEXT: s_movk_i32 s4, 0x3f1 ; VI-NEXT: v_or_b32_e32 v4, v8, v4 -; VI-NEXT: v_sub_u32_e32 v9, vcc, s6, v5 +; VI-NEXT: v_sub_u32_e32 v9, vcc, s4, v5 ; VI-NEXT: v_or_b32_e32 v8, 0x1000, v4 ; VI-NEXT: v_med3_i32 v9, v9, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v10, v9, v8 ; VI-NEXT: v_lshlrev_b32_e32 v9, v9, v10 -; VI-NEXT: v_or_b32_e32 v11, 1, v10 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v8 -; VI-NEXT: s_movk_i32 s7, 0xfc10 -; VI-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc -; VI-NEXT: v_add_u32_e32 v5, vcc, s7, v5 +; VI-NEXT: s_movk_i32 s5, 0xfc10 +; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, s5, v5 ; VI-NEXT: v_lshlrev_b32_e32 v9, 12, v5 
+; VI-NEXT: v_or_b32_e32 v8, v10, v8 ; VI-NEXT: v_or_b32_e32 v9, v4, v9 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; VI-NEXT: v_and_b32_e32 v9, 7, v8 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9 +; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v9 -; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v9, v9, v10 ; VI-NEXT: v_lshrrev_b32_e32 v8, 2, v8 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v9 ; VI-NEXT: v_mov_b32_e32 v9, 0x7c00 @@ -4832,9 +4827,9 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; VI-NEXT: v_mov_b32_e32 v10, 0x7e00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; VI-NEXT: s_movk_i32 s8, 0x40f +; VI-NEXT: s_movk_i32 s6, 0x40f ; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5 ; VI-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; VI-NEXT: v_and_b32_e32 v8, 0x1ff, v1 ; VI-NEXT: v_or_b32_e32 v0, v8, v0 @@ -4844,31 +4839,32 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v1, v1, 20, 11 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 -; VI-NEXT: v_sub_u32_e32 v8, vcc, s6, v1 +; VI-NEXT: v_sub_u32_e32 v8, vcc, s4, v1 ; VI-NEXT: v_or_b32_e32 v5, 0x1000, v0 ; VI-NEXT: v_med3_i32 v8, v8, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v11, v8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v8, v8, v11 -; VI-NEXT: v_or_b32_e32 v12, 1, v11 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, s7, v1 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1 ; VI-NEXT: v_lshlrev_b32_e32 v8, 12, v1 +; VI-NEXT: v_or_b32_e32 v5, v11, v5 ; VI-NEXT: v_or_b32_e32 v8, v0, v8 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 ; VI-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; VI-NEXT: v_and_b32_e32 v8, 7, v5 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 +; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v8 -; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v8, v8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1 ; VI-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; VI-NEXT: v_and_b32_e32 v5, 0x1ff, v3 ; VI-NEXT: v_or_b32_e32 v2, v5, v2 @@ -4878,31 +4874,32 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_bfe_u32 v3, v3, 20, 11 ; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_sub_u32_e32 v5, vcc, s6, v3 +; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v3 ; VI-NEXT: v_or_b32_e32 v2, 0x1000, v1 ; VI-NEXT: v_med3_i32 v5, v5, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v8, v5, v2 ; VI-NEXT: v_lshlrev_b32_e32 v5, v5, v8 -; VI-NEXT: v_or_b32_e32 v11, 1, v8 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v8, v11, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3 +; VI-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, s5, v3
 ; VI-NEXT: v_lshlrev_b32_e32 v5, 12, v3
+; VI-NEXT: v_or_b32_e32 v2, v8, v2
 ; VI-NEXT: v_or_b32_e32 v5, v1, v5
 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
 ; VI-NEXT: v_and_b32_e32 v5, 7, v2
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5
+; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5
-; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v5
-; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT: v_or_b32_e32 v5, v5, v8
 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v5
 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
 ; VI-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
 ; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v3
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
@@ -4915,31 +4912,32 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_movk_i32 s6, 0x1ff
-; GFX9-NEXT: v_and_or_b32 v4, v5, s6, v4
+; GFX9-NEXT: s_movk_i32 s4, 0x1ff
+; GFX9-NEXT: v_and_or_b32 v4, v5, s4, v4
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v5
-; GFX9-NEXT: s_movk_i32 s7, 0xffe
+; GFX9-NEXT: s_movk_i32 s5, 0xffe
 ; GFX9-NEXT: v_bfe_u32 v5, v5, 20, 11
-; GFX9-NEXT: v_and_or_b32 v4, v8, s7, v4
+; GFX9-NEXT: v_and_or_b32 v4, v8, s5, v4
 ; GFX9-NEXT: v_sub_u32_e32 v9, 0x3f1, v5
 ; GFX9-NEXT: v_or_b32_e32 v8, 0x1000, v4
 ; GFX9-NEXT: v_med3_i32 v9, v9, 0, 13
 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, v9, v8
 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, v9, v10
-; GFX9-NEXT: v_or_b32_e32 v11, 1, v10
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v9, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc
+; GFX9-NEXT: v_or_b32_e32 v8, v10, v8
 ; GFX9-NEXT: v_lshl_or_b32 v9, v5, 12, v4
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX9-NEXT: v_and_b32_e32 v9, 7, v8
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v9
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT: v_or_b32_e32 v9, v9, v10
 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 2, v8
 ; GFX9-NEXT: v_add_u32_e32 v8, v8, v9
 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x7c00
@@ -4947,77 +4945,79 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7e00
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT: s_movk_i32 s8, 0x40f
+; GFX9-NEXT: s_movk_i32 s6, 0x40f
 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5
-; GFX9-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
+; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0
 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1
 ; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 11
-; GFX9-NEXT: v_and_or_b32 v0, v5, s7, v0
+; GFX9-NEXT: v_and_or_b32 v0, v5, s5, v0
 ; GFX9-NEXT: v_sub_u32_e32 v11, 0x3f1, v8
 ; GFX9-NEXT: v_or_b32_e32 v5, 0x1000, v0
 ; GFX9-NEXT: v_med3_i32 v11, v11, 0, 13
 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, v11, v5
 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, v11, v12
-; GFX9-NEXT: v_or_b32_e32 v13, 1, v12
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v11, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT: v_add_u32_e32 v8, 0xfffffc10, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v13, vcc
+; GFX9-NEXT: v_or_b32_e32 v5, v12, v5
 ; GFX9-NEXT: v_lshl_or_b32 v11, v8, 12, v0
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v8
 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc
 ; GFX9-NEXT: v_and_b32_e32 v11, 7, v5
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v11
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT: v_or_b32_e32 v11, v11, v12
 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 2, v5
 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v11
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v8
 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v8
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: s_mov_b32 s9, 0x8000
-; GFX9-NEXT: v_and_or_b32 v0, v1, s9, v0
-; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v2
+; GFX9-NEXT: s_mov_b32 s7, 0x8000
+; GFX9-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v2
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3
 ; GFX9-NEXT: v_bfe_u32 v5, v3, 20, 11
-; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX9-NEXT: v_and_or_b32 v1, v2, s5, v1
 ; GFX9-NEXT: v_sub_u32_e32 v8, 0x3f1, v5
 ; GFX9-NEXT: v_or_b32_e32 v2, 0x1000, v1
 ; GFX9-NEXT: v_med3_i32 v8, v8, 0, 13
 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, v8, v2
 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, v8, v11
-; GFX9-NEXT: v_or_b32_e32 v12, 1, v11
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc
+; GFX9-NEXT: v_or_b32_e32 v2, v11, v2
 ; GFX9-NEXT: v_lshl_or_b32 v8, v5, 12, v1
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX9-NEXT: v_and_b32_e32 v8, 7, v2
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v8
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT: v_or_b32_e32 v8, v8, v11
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v2
 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v8
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX9-NEXT: v_and_or_b32 v1, v2, s9, v1
+; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1
 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
@@ -5031,106 +5031,108 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0x1ff, v5, v4
 ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2
 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 8, v5
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 20, 11
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 20, 11
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v5, 20, 11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 8, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v1, 20, 11
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v1, 20, 11
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v10, 0x3f1, v5
+; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v11, 0x3f1, v5
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0xffe, v8, v4
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 8, v3
+; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v8, 0x3f1, v10
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: v_med3_i32 v10, v10, 0, 13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x1000, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v8, v2
-; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v8, 0x3f1, v9
+; GFX11-TRUE16-NEXT: v_med3_i32 v11, v11, 0, 13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v9, v2
+; GFX11-TRUE16-NEXT: v_med3_i32 v8, v8, 0, 13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x1000, v4
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, v10, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x1000, v2
-; GFX11-TRUE16-NEXT: v_med3_i32 v8, v8, 0, 13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v15, v0
-; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v15, 0x3f1, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, v10, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, v8, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_med3_i32 v15, v15, 0, 13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, v8, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 1, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, v11, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v13, v0
+; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v13, 0x3f1, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, v8, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, v11, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x1000, v0
+; GFX11-TRUE16-NEXT: v_med3_i32 v13, v13, 0, 13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, v8, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v14, v17, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0xfffffc10, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x1000, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 1, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v9, 12, v2
-; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v10, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, v15, v14
-; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e64 s3, 31, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v10, v13, v17 :: v_dual_and_b32 v13, 7, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, v13, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v9
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v10, 12, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, v13, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10
 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0xfffffc10, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, v15, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 1, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8
-; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v13, v17
 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v5, 12, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, 0x7e00
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 7, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v15, v14
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v12, v17 :: v_dual_add_nc_u32 v16, 0xfffffc10, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v16, 12, v0
-; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e64 s1, 1, v16
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, v12, v11, s1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 7, v10
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 2, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12
-; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s2, 5, v12
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, 0x7e00
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 7, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 12, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v15, v13
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v12, v11 :: v_dual_and_b32 v12, 7, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 2, v9
 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 7, v11
+; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v12
 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 3, v14
-; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s1, 5, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, 0x7c00, v8, s3
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v15, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0x7c00, v12, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s2, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v14, 0, 1, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, 0x7c00, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, 0x7c00, v17 :: v_dual_add_nc_u32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v16
+; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14
 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h
 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v14
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v11, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v8, v2
 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v5
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -5138,7 +5140,7 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v9, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v17, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v5
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v6
@@ -5151,114 +5153,123 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0x1ff, v5, v4
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 8, v5
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v5, 20, 11
 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
-; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v1, 20, 11
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 20, 11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v5
 ; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2
-; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v9, 0x3f1, v5
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v1, 20, 11
+; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v9, 0x3f1, v8
 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v3
-; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v15, 0x3f1, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v3, 20, 11
 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-FAKE16-NEXT: v_med3_i32 v9, v9, 0, 13
-; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v3, 20, 11
-; GFX11-FAKE16-NEXT: v_med3_i32 v15, v15, 0, 13
-; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffe, v8, v4
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffe, v5, v4
 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x1000, v4
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v8, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v5, v0
+; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v10
 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, v9, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x1000, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x1000, v0
+; GFX11-FAKE16-NEXT: v_med3_i32 v5, v5, 0, 13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffe, v12, v2
-; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, v9, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 1, v13
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v14
+; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, v9, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0xfffffc10, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, v5, v14
 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x1000, v2
 ; GFX11-FAKE16-NEXT: v_med3_i32 v12, v12, 0, 13
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v11
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, v15, v16
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v13, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0xfffffc10, v5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, v12, v17
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v5, 12, v4
-; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, v12, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 1, v13
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, v15, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 1, v11
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v9, v16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v11, v15 :: v_dual_add_nc_u32 v10, 0xfffffc10, v10
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v5, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, v12, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, v12, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v15, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0xfffffc10, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v12, v17
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v10, 12, v0
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v18, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v8, 12, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v10, 12, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc_lo
 ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v14, 12, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 7, v8
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v13, v12 :: v_dual_and_b32 v13, 7, v9
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11
-; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v11
+-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc_lo
+-; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14
+-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 7, v8
+-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8
+-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v13, v12 :: v_dual_and_b32 v13, 7, v9
+-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11
+-; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v11
+ized
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v13, 12, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 7, v9
 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 2, v9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 7, v12
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 3, v13
-; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s2, 5, v13
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 2, v12
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11
-; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v11
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, s1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v13
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, 0x7e00 :: v_dual_add_nc_u32 v9, v9, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 7, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 7, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
+; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v15, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v11
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, 0x7e00
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, 0x7c00, v9, vcc_lo
+-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0
+-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v15
+-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v11
+-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, 0x7e00
+-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, 0x7c00, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v11, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, 0x7c00, v12, vcc_lo
+-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v11, vcc_lo
+-; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14
+-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, 0x7c00, v12, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v12 :: v_dual_add_nc_u32 v11, v11, v15
+; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
+-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v12, vcc_lo
 ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
+-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v13
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v5
+-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc_lo
+-; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v8
 ; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v3, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, 0x7c00, v8, vcc_lo
+-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, 0x7c00, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v9, vcc_lo
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v5
+-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
+-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v8
 ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v6
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc_lo
+-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
 ; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v7
 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
 %mag.trunc = fptrunc <3 x double> %mag to <3 x half>
@@ -5797,27 +5808,28 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT: v_and_b32_e32 v12, 0xffe, v12
 ; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; SI-NEXT: v_bfe_u32 v13, v7, 20, 11
-; SI-NEXT: s_movk_i32 s6, 0x3f1
+; SI-NEXT: s_movk_i32 s4, 0x3f1
 ; SI-NEXT: v_or_b32_e32 v6, v12, v6
-; SI-NEXT: v_sub_i32_e32 v14, vcc, s6, v13
+; SI-NEXT: v_sub_i32_e32 v14, vcc, s4, v13
 ; SI-NEXT: v_or_b32_e32 v12, 0x1000, v6
 ; SI-NEXT: v_med3_i32 v14, v14, 0, 13
 ; SI-NEXT: v_lshrrev_b32_e32 v15, v14, v12
 ; SI-NEXT: v_lshlrev_b32_e32 v14, v14, v15
-; SI-NEXT: v_or_b32_e32 v16, 1, v15
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v14, v12
-; SI-NEXT: s_movk_i32 s7, 0xfc10
-; SI-NEXT: v_cndmask_b32_e32 v12, v15, v16, vcc
-; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13
+; SI-NEXT: s_movk_i32 s5, 0xfc10
+; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-NEXT: v_add_i32_e32 v13, vcc, s5, v13
 ; SI-NEXT: v_lshlrev_b32_e32 v14, 12, v13
+; SI-NEXT: v_or_b32_e32 v12, v15, v12
 ; SI-NEXT: v_or_b32_e32 v14, v6, v14
 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v13
 ; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc
 ; SI-NEXT: v_and_b32_e32 v14, 7, v12
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v14
+; SI-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14
-; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v14
-; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
 ; SI-NEXT: v_lshrrev_b32_e32 v12, 2, v12
 ; SI-NEXT: v_add_i32_e32 v12, vcc, v12, v14
 ; SI-NEXT: v_mov_b32_e32 v14, 0x7c00
@@ -5825,9 +5837,9 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc
 ; SI-NEXT: v_mov_b32_e32 v15, 0x7e00
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; SI-NEXT: s_movk_i32 s8, 0x40f
+; SI-NEXT: s_movk_i32 s6, 0x40f
 ; SI-NEXT: v_cndmask_b32_e32 v6, v14, v15, vcc
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v13
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v13
 ; SI-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc
 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
 ; SI-NEXT: v_and_b32_e32 v12, 0x1ff, v5
@@ -5840,31 +5852,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
 ; SI-NEXT: v_bfe_u32 v12, v5, 20, 11
 ; SI-NEXT: v_or_b32_e32 v4, v7, v4
-; SI-NEXT: v_sub_i32_e32 v13, vcc, s6, v12
+; SI-NEXT: v_sub_i32_e32 v13, vcc, s4, v12
 ; SI-NEXT: v_or_b32_e32 v7, 0x1000, v4
 ; SI-NEXT: v_med3_i32 v13, v13, 0, 13
 ; SI-NEXT: v_lshrrev_b32_e32 v16, v13, v7
 ; SI-NEXT: v_lshlrev_b32_e32 v13, v13, v16
-; SI-NEXT: v_or_b32_e32 v17, 1, v16
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v13, v7
-; SI-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc
-; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12
+; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; SI-NEXT: v_add_i32_e32 v12, vcc, s5, v12
 ; SI-NEXT: v_lshlrev_b32_e32 v13, 12, v12
+; SI-NEXT: v_or_b32_e32 v7, v16, v7
 ; SI-NEXT: v_or_b32_e32 v13, v4, v13
 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v12
 ; SI-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc
 ; SI-NEXT: v_and_b32_e32 v13, 7, v7
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v13
+; SI-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v13
-; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v13
-; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; SI-NEXT: v_or_b32_e32 v13, v13, v16
 ; SI-NEXT: v_lshrrev_b32_e32 v7, 2, v7
 ; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v13
 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v12
 ; SI-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
 ; SI-NEXT: v_cndmask_b32_e32 v4, v14, v15, vcc
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v12
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v12
 ; SI-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
 ; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v3
@@ -5877,31 +5890,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT: v_bfe_u32 v7, v3, 20, 11
 ; SI-NEXT: v_or_b32_e32 v2, v5, v2
-; SI-NEXT: v_sub_i32_e32 v12, vcc, s6, v7
+; SI-NEXT: v_sub_i32_e32 v12, vcc, s4, v7
 ; SI-NEXT: v_or_b32_e32 v5, 0x1000, v2
 ; SI-NEXT: v_med3_i32 v12, v12, 0, 13
 ; SI-NEXT: v_lshrrev_b32_e32 v13, v12, v5
 ; SI-NEXT: v_lshlrev_b32_e32 v12, v12, v13
-; SI-NEXT: v_or_b32_e32 v16, 1, v13
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v12, v5
-; SI-NEXT: v_cndmask_b32_e32 v5, v13, v16, vcc
-; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7
+; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7
 ; SI-NEXT: v_lshlrev_b32_e32 v12, 12, v7
+; SI-NEXT: v_or_b32_e32 v5, v13, v5
 ; SI-NEXT: v_or_b32_e32 v12, v2, v12
 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7
 ; SI-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc
 ; SI-NEXT: v_and_b32_e32 v12, 7, v5
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12
+; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
-; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v12
-; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
 ; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5
 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v12
 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7
 ; SI-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
 ; SI-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v7
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7
 ; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT: v_and_b32_e32 v5, 0x1ff, v1
@@ -5914,31 +5928,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-NEXT: v_bfe_u32 v5, v1, 20, 11
 ; SI-NEXT: v_or_b32_e32 v0, v3, v0
-; SI-NEXT: v_sub_i32_e32 v7, vcc, s6, v5
+; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v5
 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0
 ; SI-NEXT: v_med3_i32 v7, v7, 0, 13
 ; SI-NEXT: v_lshrrev_b32_e32 v12, v7, v3
 ; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v12
-; SI-NEXT: v_or_b32_e32 v13, 1, v12
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, v12, v13, vcc
-; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5
+; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v5
 ; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v5
+; SI-NEXT: v_or_b32_e32 v3, v12, v3
 ; SI-NEXT: v_or_b32_e32 v7, v0, v7
 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
 ; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
 ; SI-NEXT: v_and_b32_e32 v7, 7, v3
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
+; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
-; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7
-; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; SI-NEXT: v_or_b32_e32 v7, v7, v12
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7
 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
 ; SI-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT: v_cndmask_b32_e32 v0, v14, v15, vcc
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
 ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
@@ -5964,27 +5979,28 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT: v_and_b32_e32 v10, 0xffe, v10
 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
 ; VI-NEXT: v_bfe_u32 v5, v5, 20, 11
-; VI-NEXT: s_movk_i32 s6, 0x3f1
+; VI-NEXT: s_movk_i32 s4, 0x3f1
 ; VI-NEXT: v_or_b32_e32 v4, v10, v4
-; VI-NEXT: v_sub_u32_e32 v11, vcc, s6, v5
+; VI-NEXT: v_sub_u32_e32 v11, vcc, s4, v5
 ; VI-NEXT: v_or_b32_e32 v10, 0x1000, v4
 ; VI-NEXT: v_med3_i32 v11, v11, 0, 13
 ; VI-NEXT: v_lshrrev_b32_e32 v12, v11, v10
 ; VI-NEXT: v_lshlrev_b32_e32 v11, v11, v12
-; VI-NEXT: v_or_b32_e32 v13, 1, v12
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v10
-; VI-NEXT: s_movk_i32 s7, 0xfc10
-; VI-NEXT: v_cndmask_b32_e32 v10, v12, v13, vcc
-; VI-NEXT: v_add_u32_e32 v5, vcc, s7, v5
+; VI-NEXT: s_movk_i32 s5, 0xfc10
+; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, s5, v5
 ; VI-NEXT: v_lshlrev_b32_e32 v11, 12, v5
+; VI-NEXT: v_or_b32_e32 v10, v12, v10
 ; VI-NEXT: v_or_b32_e32 v11, v4, v11
 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
 ; VI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
 ; VI-NEXT: v_and_b32_e32 v11, 7, v10
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11
+; VI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11
-; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v11
-; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; VI-NEXT: v_or_b32_e32 v11, v11, v12
 ; VI-NEXT: v_lshrrev_b32_e32 v10, 2, v10
 ; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v11
 ; VI-NEXT: v_mov_b32_e32 v11, 0x7c00
@@ -5992,9 +6008,9 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
 ; VI-NEXT: v_mov_b32_e32 v12, 0x7e00
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; VI-NEXT: s_movk_i32 s8, 0x40f
+; VI-NEXT: s_movk_i32 s6, 0x40f
 ; VI-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
 ; VI-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
 ; VI-NEXT: v_and_b32_e32 v10, 0x1ff, v7
 ; VI-NEXT: v_or_b32_e32 v6, v10, v6
@@ -6004,31 +6020,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; VI-NEXT: v_bfe_u32 v7, v7, 20, 11
 ; VI-NEXT: v_or_b32_e32 v5, v5, v6
-; VI-NEXT: v_sub_u32_e32 v10, vcc, s6, v7
+; VI-NEXT: v_sub_u32_e32 v10, vcc, s4, v7
 ; VI-NEXT: v_or_b32_e32 v6, 0x1000, v5
 ; VI-NEXT: v_med3_i32 v10, v10, 0, 13
 ; VI-NEXT: v_lshrrev_b32_e32 v13, v10, v6
 ; VI-NEXT: v_lshlrev_b32_e32 v10, v10, v13
-; VI-NEXT: v_or_b32_e32 v14, 1, v13
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v6
-; VI-NEXT: v_cndmask_b32_e32 v6, v13, v14, vcc
-; VI-NEXT: v_add_u32_e32 v7, vcc, s7, v7
+; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, s5, v7
 ; VI-NEXT: v_lshlrev_b32_e32 v10, 12, v7
+; VI-NEXT: v_or_b32_e32 v6, v13, v6
 ; VI-NEXT: v_or_b32_e32 v10, v5, v10
 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7
 ; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
 ; VI-NEXT: v_and_b32_e32 v10, 7, v6
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10
+; VI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10
-; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v10
-; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-NEXT: v_or_b32_e32 v10, v10, v13
 ; VI-NEXT: v_lshrrev_b32_e32 v6, 2, v6
 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v10
 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7
 ; VI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
 ; VI-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v7
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7
 ; VI-NEXT: v_and_b32_e32 v7, 0x1ff, v1
 ; VI-NEXT: v_or_b32_e32 v0, v7, v0
 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
@@ -6038,31 +6055,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT: v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT: v_or_b32_e32 v0, v6, v0
-; VI-NEXT: v_sub_u32_e32 v7, vcc, s6, v1
+; VI-NEXT: v_sub_u32_e32 v7, vcc, s4, v1
 ; VI-NEXT: v_or_b32_e32 v6, 0x1000, v0
 ; VI-NEXT: v_med3_i32 v7, v7, 0, 13
 ; VI-NEXT: v_lshrrev_b32_e32 v10, v7, v6
 ; VI-NEXT: v_lshlrev_b32_e32 v7, v7, v10
-; VI-NEXT: v_or_b32_e32 v13, 1, v10
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v6
-; VI-NEXT: v_cndmask_b32_e32 v6, v10, v13, vcc
-; VI-NEXT: v_add_u32_e32 v1, vcc, s7, v1
+; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1
 ; VI-NEXT: v_lshlrev_b32_e32 v7, 12, v1
+; VI-NEXT: v_or_b32_e32 v6, v10, v6
 ; VI-NEXT: v_or_b32_e32 v7, v0, v7
 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
 ; VI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
 ; VI-NEXT: v_and_b32_e32 v7, 7, v6
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
+; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
-; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7
-; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; VI-NEXT: v_or_b32_e32 v7, v7, v10
 ; VI-NEXT: v_lshrrev_b32_e32 v6, 2, v6
 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v7
 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
 ; VI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
 ; VI-NEXT: v_and_b32_e32 v6, 0x1ff, v3
 ; VI-NEXT: v_or_b32_e32 v2, v6, v2
@@ -6072,31 +6090,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT: v_bfe_u32 v3, v3, 20, 11
 ; VI-NEXT: v_or_b32_e32 v1, v1, v2
-; VI-NEXT: v_sub_u32_e32 v6, vcc, s6, v3
+; VI-NEXT: v_sub_u32_e32 v6, vcc, s4, v3
 ; VI-NEXT: v_or_b32_e32 v2, 0x1000, v1
 ; VI-NEXT: v_med3_i32 v6, v6, 0, 13
 ; VI-NEXT: v_lshrrev_b32_e32 v7, v6, v2
 ; VI-NEXT: v_lshlrev_b32_e32 v6, v6, v7
-; VI-NEXT: v_or_b32_e32 v10, 1, v7
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc
-; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, s5, v3
 ; VI-NEXT: v_lshlrev_b32_e32 v6, 12, v3
+; VI-NEXT: v_or_b32_e32 v2, v7, v2
 ; VI-NEXT: v_or_b32_e32 v6, v1, v6
 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
 ; VI-NEXT: v_and_b32_e32 v6, 7, v2
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6
+; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6
-; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v6
-; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-NEXT: v_or_b32_e32 v6, v6, v7
 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
 ; VI-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
 ; VI-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, s8, v3
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
@@ -6112,31 +6131,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f64_sign_v4f16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_movk_i32 s6, 0x1ff
-; GFX9-NEXT: v_and_or_b32 v4, v5, s6, v4
+; GFX9-NEXT: s_movk_i32 s4, 0x1ff
+; GFX9-NEXT: v_and_or_b32 v4, v5, s4, v4
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5
-; GFX9-NEXT: s_movk_i32 s7, 0xffe
+; GFX9-NEXT: s_movk_i32 s5, 0xffe
 ; GFX9-NEXT: v_bfe_u32 v11, v5, 20, 11
-; GFX9-NEXT: v_and_or_b32 v4, v10, s7, v4
+; GFX9-NEXT: v_and_or_b32 v4, v10, s5, v4
 ; GFX9-NEXT: v_sub_u32_e32 v12, 0x3f1, v11
 ; GFX9-NEXT: v_or_b32_e32 v10, 0x1000, v4
 ; GFX9-NEXT: v_med3_i32 v12, v12, 0, 13
 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, v12, v10
 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, v12, v13
-; GFX9-NEXT: v_or_b32_e32 v14, 1, v13
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v12, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT: v_add_u32_e32 v11, 0xfffffc10, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v14, vcc
+; GFX9-NEXT: v_or_b32_e32 v10, v13, v10
 ; GFX9-NEXT: v_lshl_or_b32 v12, v11, 12, v4
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v11
 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc
 ; GFX9-NEXT: v_and_b32_e32 v12, 7, v10
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12
+; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v12
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GFX9-NEXT: v_or_b32_e32 v12, v12, v13
 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 2, v10
 ; GFX9-NEXT: v_add_u32_e32 v10, v10, v12
 ; GFX9-NEXT: v_mov_b32_e32 v12, 0x7c00
@@ -6144,112 +6164,115 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc
 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x7e00
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT: s_movk_i32 s8, 0x40f
+; GFX9-NEXT: s_movk_i32 s6, 0x40f
 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v13, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v11
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v11
 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX9-NEXT: s_mov_b32 s9, 0x8000
-; GFX9-NEXT: v_and_or_b32 v4, v5, s9, v4
-; GFX9-NEXT: v_and_or_b32 v5, v7, s6, v6
+; GFX9-NEXT: s_mov_b32 s7, 0x8000
+; GFX9-NEXT: v_and_or_b32 v4, v5, s7, v4
+; GFX9-NEXT: v_and_or_b32 v5, v7, s4, v6
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v7
 ; GFX9-NEXT: v_bfe_u32 v10, v7, 20, 11
-; GFX9-NEXT: v_and_or_b32 v5, v6, s7, v5
+; GFX9-NEXT: v_and_or_b32 v5, v6, s5, v5
 ; GFX9-NEXT: v_sub_u32_e32 v11, 0x3f1, v10
 ; GFX9-NEXT: v_or_b32_e32 v6, 0x1000, v5
 ; GFX9-NEXT: v_med3_i32 v11, v11, 0, 13
 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, v11, v6
 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, v11, v14
-; GFX9-NEXT: v_or_b32_e32 v15, 1, v14
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v11, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT: v_add_u32_e32 v10, 0xfffffc10, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v15, vcc
+; GFX9-NEXT: v_or_b32_e32 v6, v14, v6
 ; GFX9-NEXT: v_lshl_or_b32 v11, v10, 12, v5
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v10
 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
 ; GFX9-NEXT: v_and_b32_e32 v11, 7, v6
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v11
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT: v_or_b32_e32 v11, v11, v14
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v6
 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v11
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v10
 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v13, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v10
-; GFX9-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v10
+; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0
 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v7
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_and_or_b32 v5, v6, s9, v5
+; GFX9-NEXT: v_and_or_b32 v5, v6, s7, v5
 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-NEXT: v_bfe_u32 v7, v1, 20, 11
-; GFX9-NEXT: v_and_or_b32 v0, v6, s7, v0
+; GFX9-NEXT: v_and_or_b32 v0, v6, s5, v0
 ; GFX9-NEXT: v_sub_u32_e32 v10, 0x3f1, v7
 ; GFX9-NEXT: v_or_b32_e32 v6, 0x1000, v0
 ; GFX9-NEXT: v_med3_i32 v10, v10, 0, 13
 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, v10, v6
 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, v10, v11
-; GFX9-NEXT: v_or_b32_e32 v14, 1, v11
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v10, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT: v_add_u32_e32 v7, 0xfffffc10, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v11, v14, vcc
+; GFX9-NEXT: v_or_b32_e32 v6, v11, v6
 ; GFX9-NEXT: v_lshl_or_b32 v10, v7, 12, v0
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7
 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
 ; GFX9-NEXT: v_and_b32_e32 v10, 7, v6
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v10
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-NEXT: v_or_b32_e32 v10, v10, v11
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v6
 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v10
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7
 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v7
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_or_b32 v0, v1, s9, v0
-; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v2
+; GFX9-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v2
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3
 ; GFX9-NEXT: v_bfe_u32 v6, v3, 20, 11
-; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX9-NEXT: v_and_or_b32 v1, v2, s5, v1
 ; GFX9-NEXT: v_sub_u32_e32 v7, 0x3f1, v6
 ; GFX9-NEXT: v_or_b32_e32 v2, 0x1000, v1
 ; GFX9-NEXT: v_med3_i32 v7, v7, 0, 13
 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, v7, v2
 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, v7, v10
-; GFX9-NEXT: v_or_b32_e32 v11, 1, v10
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v7, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT: v_add_u32_e32 v6, 0xfffffc10, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc
+; GFX9-NEXT: v_or_b32_e32 v2, v10, v2
 ; GFX9-NEXT: v_lshl_or_b32 v7, v6, 12, v1
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6
 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX9-NEXT: v_and_b32_e32 v7, 7, v2
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_or_b32_e32 v7, v7, v10
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v2
 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v7
 ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6
 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v13, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v6
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX9-NEXT: v_and_or_b32 v1, v2, s9, v1
+; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1
 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT: s_mov_b32 s5, 0x7fff7fff
@@ -6265,149 +6288,158 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v7, 20, 11
 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 8, v7
 ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0x1ff, v5, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 8, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
 ; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v10
-; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v5, 20, 11
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v5
 ; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 20, 11
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v3, 20, 11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v1, 20, 11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v7.h
 ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, 0xffe, v11, v6
 ; GFX11-TRUE16-NEXT: v_med3_i32 v11, v12, 0, 13
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x1000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0xffe, v15, v4
 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, v11, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x1000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, v11, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 1, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 8, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11
 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10
-; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v16
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v14, v17 :: v_dual_add_nc_u32 v16, 0xfffffc10, v16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v10, 12, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v14, 0x3f1, v17
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v10, 12, v6
 ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10
-; GFX11-TRUE16-NEXT: v_med3_i32 v12, v12, 0, 13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v3, 20, 11
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v18, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, 0x7e00
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, v12, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0xffe, v16, v4
+; GFX11-TRUE16-NEXT: v_med3_i32 v14, v14, 0, 13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v21, v11, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 7, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, v12, v19
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, 0x7e00
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x1000, v4
+; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v21, 0x3f1, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 7, v11
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 1, v19
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v18
-; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v18
-; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v18, 0x3f1, v17
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v14, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v16, 12, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0xfffffc10, v17
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v12, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v21, 0, 1, s0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x1000, v2
-; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e64 s1, 1, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v19, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
+; GFX11-TRUE16-NEXT: v_med3_i32 v21, v21, 0, 13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v18, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, v14, v22
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v12, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, 0
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, 0x7c00, v13, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v16
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v12, v14, v12 :: v_dual_add_nc_u32 v11, v11, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v23
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, 0x7c00, v13 :: v_dual_add_nc_u32 v11, v11, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, v14, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x1000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v12, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, v21, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10
-; GFX11-TRUE16-NEXT: v_med3_i32 v14, v18, 0, 13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, 0x7c00, v11 :: v_dual_and_b32 v18, 7, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v18, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, v14, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 2, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0xfffffc10, v17
+; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v10, 0x3f1, v20
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 20, 11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, v14, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 1, v19
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v18
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, v14, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 12, v2
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v10, v0
-; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v10, 0x3f1, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v14, v19, v21, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0xfffffc10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x1000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v17, 12, v4
+; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, v21, v22
 ; GFX11-TRUE16-NEXT: v_med3_i32 v10, v10, 0, 13
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v14, v15, v14, s1
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e64 s1, 1, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, v10, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 7, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 2, v14
-; GFX11-TRUE16-NEXT: v_and_or_b32 v6, 0x8000, v20, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v6, 0x8000, v15, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x1000, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v14, 12, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, v10, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v22, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 7, v12
 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, v10, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 1, v7
-; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v10, v22
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v11, 12, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 2, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v18
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 7, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v10, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0xfffffc10, v20
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v10, v7, s1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v12, v15
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 7, v7
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 0x40f, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v16, 12, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v16
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v10, v7 :: v_dual_add_nc_u32 v10, v12, v18
+; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 7, v7
 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 2, v7
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, 0x7c00, v13, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, v10, v4, s1
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e64 s1, 31, v17
-; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0x8000, v20, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v3.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, 0x7c00, v13 :: v_dual_add_nc_u32 v7, v7, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v12
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, 0x7c00, v13 :: v_dual_add_nc_u32 v11, v11, v12
+; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v18
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v18
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v18, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0x8000, v15, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.h
 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0x7c00, v12, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v11
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v20, v2
+; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v16
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v15, v2
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v7, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v1.h
 ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v4, v9
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v13, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v16
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x8000, v20, v0
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x8000, v15, v0
 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v8
@@ -6423,144 +6455,150 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v7
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v10
-; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v7, 20, 11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v7, 20, 11
 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
 ; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2
 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v17, 0x3f1, v14
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v3
-; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v3, 20, 11
+; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v17, 0x3f1, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
 ; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffe, v11, v4
 ; GFX11-FAKE16-NEXT: v_med3_i32 v11, v12, 0, 13
 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x1000, v4
 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-FAKE16-NEXT: v_and_or_b32 v6, 0xffe, v13, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v14
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, v11, v12
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e64 s3, 0, v6
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, v11, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 1, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x1000, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v6, 0xffe, v13, v6
+; GFX11-FAKE16-NEXT: v_med3_i32 v13, v17, 0, 13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, v11, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x1000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, v11, v14
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v1, 20, 11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v14, v11
 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v15, v16, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v10, 12, v4
-; GFX11-FAKE16-NEXT: v_med3_i32 v15, v17, 0, 13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x1000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, v13, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v10, 12, v4
 ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, v15, v16
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc_lo
-; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v1, 20, 11
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, v15, v17
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 7, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 1, v17
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e64 s1, v15, v16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v19
-; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v14, 12, v6
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, v17, v21, s1
-; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v17, 0x3f1, v20
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, s0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0xfffffc10, v20
-; GFX11-FAKE16-NEXT: v_med3_i32 v17, v17, 0, 13
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v19, v15, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, v13, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v18, v11, vcc_lo
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, 0x7e00 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v16 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v18, 0x3f1, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 7, v11 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v12, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11 +; GFX11-FAKE16-NEXT: v_med3_i32 v18, v18, 0, 13 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v16, v0 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v13 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v13, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v14, 12, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v19 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v15, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x1000, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v3, 20, 11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v15 ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffe, v18, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x1000, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_med3_i32 v12, v12, 0, 13 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0xfffffc10, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, v12, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, v12, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 7, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 1, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 2, v15 -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x1000, v2 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, v17, v16 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v12, v18 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v13, 12, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, v17, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 1, v23 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v21, v24, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v17, v16 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v20, 12, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v23, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v13 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v18, v12, 
vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, v18, v16 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffe, v17, v2 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v17, 0x3f1, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0xfffffc10, v19 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, 0x7c00, v11 :: v_dual_lshlrev_b32 v18, v18, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 7, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x1000, v2 +; GFX11-FAKE16-NEXT: v_med3_i32 v17, v17, 0, 13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 2, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, v17, v21 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v18, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, 0x7e00 :: v_dual_lshlrev_b32 v17, v17, v23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v17, v21 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v23, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0xfffffc10, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v12, 12, v0 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v20, v16, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7c00, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v20 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v19, 12, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, 0x7c00, v18 :: v_dual_and_b32 v15, 7, v16 +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v19 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v20, v17, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 7, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 2, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 7, v16 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14 -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s0, 5, v17 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v6, 0x7c00, v19, s3 -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e64 s2, 5, v10 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v15, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 2, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 7, 
v17 ; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0x8000, v5, v4 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v15, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 2, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 2, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v15, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v20 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7c00, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v10 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v12 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, 0x7c00, v13 :: v_dual_add_nc_u32 v10, v16, v11 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v16, v15 -; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v20 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v18, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v19 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v13 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v20 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v18, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v19 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo ; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v3, v2 ; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x8000, v7, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index af0c38c5624ba..462d7748b86cd 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -3944,10 +3944,9 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 20, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2 -; SI-NEXT: v_and_b32_e32 v3, 0x7ff, v3 +; SI-NEXT: v_bfe_u32 v3, v1, 20, 11 ; SI-NEXT: s_movk_i32 s4, 0x3f1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_sub_i32_e32 v4, vcc, s4, v3 @@ -3955,20 +3954,21 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; SI-NEXT: v_med3_i32 v4, v4, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v6, 1, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 ; SI-NEXT: v_or_b32_e32 v4, v0, v4 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -3994,10 +3994,9 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v3, 20, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: v_and_b32_e32 v2, 0xffe, v2 -; VI-NEXT: v_and_b32_e32 v3, 0x7ff, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 20, 11 ; VI-NEXT: s_movk_i32 s4, 0x3f1 ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_sub_u32_e32 v4, vcc, s4, v3 @@ -4005,20 +4004,21 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; VI-NEXT: v_med3_i32 v4, v4, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 -; VI-NEXT: v_or_b32_e32 v6, 1, v5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; VI-NEXT: v_or_b32_e32 v2, v5, v2 ; VI-NEXT: v_or_b32_e32 v4, v0, v4 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-NEXT: v_and_b32_e32 v4, 7, v2 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 -; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v4, v4, v5 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -4041,47 +4041,47 @@ define half @v_fneg_fp_round_f64_to_f16(double 
%a) #0 { ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 20, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 20, 11 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7ff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v3, 0x3f1, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xfffffc10, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13 ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_med3_i32 v3, v3, 0, 13 -; GFX11-NEXT: v_lshl_or_b32 v7, v2, 12, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v3, v5 -; GFX11-NEXT: v_or_b32_e32 v6, 1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v4, 7, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 -; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 5, v4 -; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v5 :: v_dual_add_nc_u32 v3, v3, v4 -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fpround = fptrunc double %a to half @@ -4106,20 +4106,21 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_med3_i32 v4, v4, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v6, 1, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 ; SI-NEXT: v_or_b32_e32 v4, v0, v4 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -4153,20 +4154,21 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; VI-NEXT: v_med3_i32 v4, v4, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 -; VI-NEXT: v_or_b32_e32 v6, 1, v5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; VI-NEXT: v_or_b32_e32 v2, v5, v2 ; VI-NEXT: v_or_b32_e32 v4, v0, v4 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-NEXT: v_and_b32_e32 v4, 7, v2 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 -; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v4, v4, v5 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -4200,32 +4202,35 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 -; GFX11-NEXT: v_or_b32_e32 v6, 1, v5 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v6 :: v_dual_add_nc_u32 v3, 0xfffffc10, v3 -; GFX11-NEXT: v_lshl_or_b32 v7, v3, 12, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 +; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_cndmask_b32 v2, v7, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 -; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 5, v4 -; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg.a = fneg double %a @@ -4253,20 +4258,21 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; SI-NEXT: v_med3_i32 v5, v5, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v2 ; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v6 -; SI-NEXT: v_or_b32_e32 v7, 1, v6 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_or_b32_e32 v5, v0, v5 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 ; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; SI-NEXT: v_and_b32_e32 v5, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v5 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; 
SI-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; SI-NEXT: v_mov_b32_e32 v5, 0x7c00 @@ -4304,20 +4310,21 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; VI-NEXT: v_med3_i32 v6, v6, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v7, v6, v4 ; VI-NEXT: v_lshlrev_b32_e32 v6, v6, v7 -; VI-NEXT: v_or_b32_e32 v8, 1, v7 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v5 ; VI-NEXT: v_lshlrev_b32_e32 v6, 12, v5 +; VI-NEXT: v_or_b32_e32 v4, v7, v4 ; VI-NEXT: v_or_b32_e32 v6, v0, v6 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 ; VI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; VI-NEXT: v_and_b32_e32 v6, 7, v4 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v6 -; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v6, v6, v7 ; VI-NEXT: v_lshrrev_b32_e32 v4, 2, v4 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_mov_b32_e32 v6, 0x7c00 @@ -4352,28 +4359,32 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v5, 0x1000, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v3, v6 -; GFX11-NEXT: v_or_b32_e32 v7, 1, v6 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v3, v6, v7 :: v_dual_add_nc_u32 v4, 0xfffffc10, v4 -; GFX11-NEXT: v_lshl_or_b32 v8, v4, 12, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0xfffffc10, v4 +; GFX11-NEXT: v_lshl_or_b32 v5, v4, 12, v2 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v6, 0x7e00 :: v_dual_cndmask_b32 v3, v8, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v5, 7, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v5 -; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 5, v5 -; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX11-NEXT: v_dual_cndmask_b32 v2, 0x7c00, v6 :: v_dual_add_nc_u32 v3, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v6, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v6, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v4 ; GFX11-NEXT: 
v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo @@ -4510,20 +4521,21 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; SI-NEXT: v_med3_i32 v7, v7, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v8, v7, v5 ; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v8 -; SI-NEXT: v_or_b32_e32 v9, 1, v8 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_or_b32_e32 v7, v4, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 ; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; SI-NEXT: v_and_b32_e32 v7, 7, v5 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; SI-NEXT: v_mov_b32_e32 v7, 0x7c00 @@ -4560,21 +4572,22 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; VI-NEXT: v_med3_i32 v7, v7, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v8, v7, v5 ; VI-NEXT: v_lshlrev_b32_e32 v7, v7, v8 -; VI-NEXT: v_or_b32_e32 v9, 1, v8 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6 ; VI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 +; VI-NEXT: v_or_b32_e32 v5, v8, v5 ; VI-NEXT: v_or_b32_e32 v7, v4, v7 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 ; VI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; VI-NEXT: v_and_b32_e32 v7, 7, v5 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 +; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v7 -; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; VI-NEXT: v_mul_f64 v[2:3], -v[0:1], v[2:3] -; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; VI-NEXT: v_or_b32_e32 v7, v7, v8 ; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v7 ; VI-NEXT: v_mov_b32_e32 v7, 0x7c00 @@ -4612,27 +4625,28 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v8, v5, v7 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v5, v8 -; GFX11-NEXT: v_or_b32_e32 v9, 1, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6 -; GFX11-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: 
v_lshl_or_b32 v10, v6, 12, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v7, v6, 12, v4 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 7, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 2, v5 +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 5, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 2, v5 -; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX11-NEXT: v_dual_mov_b32 v7, 0x7e00 :: v_dual_add_nc_u32 v0, v5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7c00, v7, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v5 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6 @@ -4667,20 +4681,21 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_med3_i32 v4, v4, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v6, 1, v5 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 ; SI-NEXT: v_or_b32_e32 v4, v0, v4 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-NEXT: v_and_b32_e32 v4, 7, v2 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -4716,20 +4731,21 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; VI-NEXT: v_med3_i32 v4, v4, 0, 13 ; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 ; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 -; VI-NEXT: v_or_b32_e32 v6, 1, v5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 ; VI-NEXT: s_movk_i32 s4, 0xfc10 -; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; VI-NEXT: v_or_b32_e32 v2, v5, v2 ; VI-NEXT: v_or_b32_e32 v4, v0, v4 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; VI-NEXT: v_and_b32_e32 v4, 7, v2 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4 -; 
VI-NEXT: v_cmp_lt_i32_e64 s[4:5], 5, v4 -; VI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-NEXT: v_or_b32_e32 v4, v4, v5 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00 @@ -4764,34 +4780,36 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 -; GFX11-NEXT: v_or_b32_e32 v6, 1, v5 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v6 :: v_dual_add_nc_u32 v3, 0xfffffc10, v3 -; GFX11-NEXT: v_lshl_or_b32 v7, v3, 12, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 +; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_cndmask_b32 v2, v7, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 -; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 5, v4 -; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v1, 0x8000, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-SAFE-TRUE16-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index c20b99444ab35..20809e6b5afcc 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1002,11 +1002,12 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) { ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_xor_b32_e32 v2, 0x8000, v0 -; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, 1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index d99cf35c482a4..49c563eef5d82 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -111,36 +111,34 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 ; SI-NEXT: v_readfirstlane_b32 s1, v0 ; SI-NEXT: s_sub_i32 s6, 0x3f1, s0 -; SI-NEXT: s_or_b32 s10, s8, s1 +; SI-NEXT: s_or_b32 s1, s8, s1 ; SI-NEXT: v_med3_i32 v0, s6, 0, 13 -; SI-NEXT: s_or_b32 s1, s10, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: s_lshr_b32 s8, s1, s6 -; SI-NEXT: s_or_b32 s9, s8, 1 -; SI-NEXT: s_lshl_b32 s6, s8, s6 -; SI-NEXT: s_cmp_lg_u32 s6, s1 -; SI-NEXT: s_cselect_b32 s1, s9, s8 -; SI-NEXT: s_add_i32 s6, s0, 0xfffffc10 -; SI-NEXT: s_lshl_b32 s0, s6, 12 -; SI-NEXT: s_or_b32 s0, s10, s0 -; SI-NEXT: s_cmp_lt_i32 s6, 1 -; SI-NEXT: s_cselect_b32 s11, s1, s0 -; SI-NEXT: s_and_b32 s8, s11, 7 -; SI-NEXT: s_cmp_eq_u32 s8, 3 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_or_b32 s6, s1, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_lshr_b32 s9, s6, s8 +; SI-NEXT: s_lshl_b32 s8, s9, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s6 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_addk_i32 s0, 0xfc10 +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: s_lshl_b32 s8, s0, 12 +; SI-NEXT: s_or_b32 s8, s1, s8 +; SI-NEXT: s_cmp_lt_i32 s0, 1 +; SI-NEXT: s_cselect_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s6, 7 ; SI-NEXT: s_cmp_gt_i32 s8, 5 -; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s0, 1, 0 -; SI-NEXT: s_lshr_b32 s1, s11, 2 -; SI-NEXT: s_add_i32 s1, s1, s0 -; SI-NEXT: s_cmp_lt_i32 s6, 31 -; SI-NEXT: s_cselect_b32 s0, s1, 0x7c00 -; SI-NEXT: s_cmp_lg_u32 s10, 0 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_lshr_b32 s6, s6, 2 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_add_i32 s6, s6, s8 +; SI-NEXT: s_cmp_lt_i32 s0, 31 +; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s1, 0 ; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s6, 0x40f -; SI-NEXT: s_cselect_b32 s0, s1, s0 +; SI-NEXT: s_cmpk_eq_i32 s0, 0x40f +; SI-NEXT: s_cselect_b32 s0, s1, s6 ; SI-NEXT: s_lshr_b32 s1, s7, 16 ; SI-NEXT: s_and_b32 s1, s1, 0x8000 ; SI-NEXT: s_or_b32 s6, s1, s0 @@ -167,39 +165,37 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; VI-SAFE-SDAG-NEXT: s_bfe_u32 s5, s7, 0xb0014 -; VI-SAFE-SDAG-NEXT: 
s_or_b32 s6, s8, s4 -; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s5 +; VI-SAFE-SDAG-NEXT: s_bfe_u32 s6, s7, 0xb0014 +; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4 +; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s6 ; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13 -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s6, 0x1000 +; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 ; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s4, s8 -; VI-SAFE-SDAG-NEXT: s_or_b32 s10, s9, 1 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s5, s8 ; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s9, s8 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s4 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s10, s9 -; VI-SAFE-SDAG-NEXT: s_add_i32 s10, s5, 0xfffffc10 -; VI-SAFE-SDAG-NEXT: s_lshl_b32 s5, s10, 12 -; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5 -; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 1 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s11, s4, s5 -; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s11, 7 -; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s5 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; VI-SAFE-SDAG-NEXT: s_addk_i32 s6, 0xfc10 +; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s6, 12 +; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s9, s5 +; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s4, s8 +; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 1 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s8 +; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s5, 7 ; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-SAFE-SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; VI-SAFE-SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, 1, 0 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s11, 2 -; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s4 -; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 31 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s5, 0x7c00 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0 -; VI-SAFE-SDAG-NEXT: s_movk_i32 s5, 0x7e00 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s8, s9 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s8 +; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 31 ; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s5, s4 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; VI-SAFE-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s5 ; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s7, 16 ; VI-SAFE-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 ; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s5, s4 @@ -300,23 +296,21 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 ; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 ; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s8, s7, 1 ; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s8, s7 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 ; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 ; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 ; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 ; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 ; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 -; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0 
; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 -; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 ; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 ; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 ; GFX10-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 ; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 ; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00 @@ -431,26 +425,23 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 ; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s8, s7, 1 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s8, s7 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 ; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 ; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 ; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 ; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 -; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0 ; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 -; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s6, exec_lo +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 ; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 ; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 ; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 ; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll index 27e5b521ae8c3..d8f21d285ddff 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll @@ -284,85 +284,91 @@ define <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) { ; GFX950-SDAG-LABEL: v_test_cvt_v2f64_v2f16: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: s_movk_i32 s2, 0x1ff -; GFX950-SDAG-NEXT: v_and_or_b32 v0, v1, s2, v0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0x1ff +; GFX950-SDAG-NEXT: v_and_or_b32 v0, v1, s0, v0 ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX950-SDAG-NEXT: s_movk_i32 s3, 0xffe +; GFX950-SDAG-NEXT: s_movk_i32 s1, 0xffe ; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX950-SDAG-NEXT: v_bfe_u32 v5, v1, 20, 11 -; GFX950-SDAG-NEXT: v_and_or_b32 v0, v4, s3, v0 +; GFX950-SDAG-NEXT: v_and_or_b32 v0, v4, s1, v0 ; GFX950-SDAG-NEXT: v_sub_u32_e32 v6, 0x3f1, v5 ; GFX950-SDAG-NEXT: v_or_b32_e32 v4, 
0x1000, v0 ; GFX950-SDAG-NEXT: v_med3_i32 v6, v6, 0, 13 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v7, v6, v4 ; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v6, v6, v7 -; GFX950-SDAG-NEXT: v_or_b32_e32 v8, 1, v7 ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4 ; GFX950-SDAG-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5 ; GFX950-SDAG-NEXT: v_lshl_or_b32 v6, v5, 12, v0 -; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX950-SDAG-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x40f +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0x40f +; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX950-SDAG-NEXT: v_and_b32_e32 v6, 7, v4 -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 -; GFX950-SDAG-NEXT: v_cmp_lt_i32_e64 s[0:1], 5, v6 -; GFX950-SDAG-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; GFX950-SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v4 -; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s3, 0x8000 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX950-SDAG-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX950-SDAG-NEXT: v_add_u32_e32 v4, v4, v6 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0x7c00 ; GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 -; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX950-SDAG-NEXT: s_mov_b32 s5, 0x8000 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX950-SDAG-NEXT: s_nop 0 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v5 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v5 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX950-SDAG-NEXT: v_and_or_b32 v0, v1, s5, v0 -; GFX950-SDAG-NEXT: v_and_or_b32 v1, v3, s2, v2 +; GFX950-SDAG-NEXT: v_and_or_b32 v0, v1, s3, v0 +; GFX950-SDAG-NEXT: v_and_or_b32 v1, v3, s0, v2 ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 8, v3 ; GFX950-SDAG-NEXT: v_bfe_u32 v4, v3, 20, 11 ; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX950-SDAG-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX950-SDAG-NEXT: v_and_or_b32 v1, v2, s1, v1 ; GFX950-SDAG-NEXT: v_sub_u32_e32 v5, 0x3f1, v4 ; GFX950-SDAG-NEXT: v_or_b32_e32 v2, 0x1000, v1 ; GFX950-SDAG-NEXT: v_med3_i32 v5, v5, 0, 13 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v8, v5, v2 ; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v5, v5, v8 -; GFX950-SDAG-NEXT: v_or_b32_e32 v9, 1, v8 ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 ; GFX950-SDAG-NEXT: v_add_u32_e32 v4, 0xfffffc10, v4 ; GFX950-SDAG-NEXT: v_lshl_or_b32 v5, v4, 12, v1 -; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX950-SDAG-NEXT: v_or_b32_e32 v2, v8, v2 ; GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 -; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-SDAG-NEXT: s_nop 0 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX950-SDAG-NEXT: v_and_b32_e32 v5, 7, v2 -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 -; GFX950-SDAG-NEXT: v_cmp_lt_i32_e64 s[0:1], 5, v5 -; GFX950-SDAG-NEXT: s_or_b64 s[0:1], s[0:1], vcc +; 
GFX950-SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX950-SDAG-NEXT: v_or_b32_e32 v5, v5, v8 ; GFX950-SDAG-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX950-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 -; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v4 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX950-SDAG-NEXT: v_and_or_b32 v1, v2, s5, v1 +; GFX950-SDAG-NEXT: v_and_or_b32 v1, v2, s3, v1 ; GFX950-SDAG-NEXT: v_perm_b32 v0, v1, v0, s0 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; From ce7cd980bb09e2141f526f6e2eff72ea0e36da31 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 14 Jul 2025 09:47:48 -0500 Subject: [PATCH 28/29] Fix broken insert-delay-alu-bug.ll test --- .../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 462 ++++++------------ 1 file changed, 154 insertions(+), 308 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index a841f7ffa02b9..9389f1614721f 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -51,314 +51,160 @@ bb: ; FIXME: This generates "instid1(/* invalid instid value */)". 
define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) { - -; GFX11-TRUE16-LABEL: f2: -; GFX11-TRUE16: ; %bb.0: ; %bb -; GFX11-TRUE16-NEXT: s_mov_b64 s[16:17], s[4:5] -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v31, v0 -; GFX11-TRUE16-NEXT: s_load_b32 s19, s[16:17], 0x24 -; GFX11-TRUE16-NEXT: s_mov_b32 s12, s13 -; GFX11-TRUE16-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX11-TRUE16-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-TRUE16-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-TRUE16-NEXT: s_mov_b32 s20, 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, -1 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo -; GFX11-TRUE16-NEXT: s_mov_b32 s32, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mul_lo_u32 v0, s19, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_13 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %bb14 -; GFX11-TRUE16-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c -; GFX11-TRUE16-NEXT: s_mov_b32 s18, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_bitcmp1_b32 s21, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s24, -1, 0 -; GFX11-TRUE16-NEXT: s_bitcmp0_b32 s21, 0 -; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB2_3 -; GFX11-TRUE16-NEXT: ; %bb.2: ; %bb15 -; GFX11-TRUE16-NEXT: s_add_u32 s8, s16, 0x58 -; GFX11-TRUE16-NEXT: s_addc_u32 s9, s17, 0 -; GFX11-TRUE16-NEXT: s_getpc_b64 s[0:1] -; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 -; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 -; GFX11-TRUE16-NEXT: s_mov_b32 s13, s14 -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-TRUE16-NEXT: s_mov_b32 s21, s14 -; GFX11-TRUE16-NEXT: s_mov_b32 s14, s15 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, -1 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_4 -; GFX11-TRUE16-NEXT: s_branch .LBB2_12 -; GFX11-TRUE16-NEXT: .LBB2_3: -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: .LBB2_4: ; %bb16 -; GFX11-TRUE16-NEXT: s_load_b32 s1, s[16:17], 0x54 -; GFX11-TRUE16-NEXT: s_bitcmp1_b32 s23, 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s8, -1 -; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s23, 1 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_bitcmp1_b32 s1, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s9, 0 -; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB2_8 -; GFX11-TRUE16-NEXT: ; %bb.5: ; %bb18.preheader -; GFX11-TRUE16-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mul_hi_u32 s8, s29, s28 -; GFX11-TRUE16-NEXT: s_mul_i32 s9, s29, s28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s8, s9, 1 -; GFX11-TRUE16-NEXT: s_mov_b32 s9, 0 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s8, s30 -; GFX11-TRUE16-NEXT: s_mul_i32 s8, s8, s22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_mul_i32 s8, s8, s20 -; GFX11-TRUE16-NEXT: s_or_b32 
s8, s19, s8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b64 s[20:21], s[8:9], 1 -; GFX11-TRUE16-NEXT: s_mov_b32 s8, s9 -; GFX11-TRUE16-NEXT: global_load_u16 v1, v0, s[20:21] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-TRUE16-NEXT: .p2align 6 -; GFX11-TRUE16-NEXT: .LBB2_6: ; %bb18 -; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-TRUE16-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s8, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s8, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s8, s1, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, exec_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s19, s13 -; GFX11-TRUE16-NEXT: s_and_b32 s13, 0xffff, s9 -; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 1 -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s13, -1, 0 -; GFX11-TRUE16-NEXT: s_and_b32 s20, s2, exec_lo -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s19, s13 -; GFX11-TRUE16-NEXT: s_or_b32 s19, s9, 0x100 -; GFX11-TRUE16-NEXT: s_and_b32 s13, 1, s13 -; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s13, 1 -; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s19, s9 -; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_6 -; GFX11-TRUE16-NEXT: ; %bb.7: ; %Flow -; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 -; GFX11-TRUE16-NEXT: .LBB2_8: ; %Flow12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_b32 vcc_lo, exec_lo, s8 -; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_12 -; GFX11-TRUE16-NEXT: ; %bb.9: -; GFX11-TRUE16-NEXT: s_xor_b32 s1, s1, -1 -; GFX11-TRUE16-NEXT: .LBB2_10: ; %bb17 -; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_b32 vcc_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_10 -; GFX11-TRUE16-NEXT: ; %bb.11: ; %Flow6 -; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1 -; GFX11-TRUE16-NEXT: .LBB2_12: ; %Flow11 -; GFX11-TRUE16-NEXT: s_and_b32 s20, s0, exec_lo -; GFX11-TRUE16-NEXT: s_or_not1_b32 s0, s18, exec_lo -; GFX11-TRUE16-NEXT: .LBB2_13: ; %Flow9 -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s3, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_15 -; GFX11-TRUE16-NEXT: ; %bb.14: ; %bb43 -; GFX11-TRUE16-NEXT: s_add_u32 s8, s16, 0x58 -; GFX11-TRUE16-NEXT: s_addc_u32 s9, s17, 0 -; GFX11-TRUE16-NEXT: s_getpc_b64 s[0:1] -; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 -; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 -; GFX11-TRUE16-NEXT: s_mov_b32 s13, s14 -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-TRUE16-NEXT: s_mov_b32 s14, s15 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: s_or_b32 s20, s20, exec_lo -; GFX11-TRUE16-NEXT: .LBB2_15: ; %Flow14 -; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s20 -; GFX11-TRUE16-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock -; GFX11-TRUE16-NEXT: ; divergent unreachable -; GFX11-TRUE16-NEXT: ; %bb.17: ; %UnifiedReturnBlock -; GFX11-TRUE16-NEXT: s_endpgm -; GFX11-FAKE16-LABEL: f2: -; GFX11-FAKE16: ; %bb.0: ; %bb -; GFX11-FAKE16-NEXT: s_mov_b64 s[16:17], s[4:5] -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v0 -; GFX11-FAKE16-NEXT: s_load_b32 s19, s[16:17], 0x24 -; GFX11-FAKE16-NEXT: s_mov_b32 s12, s13 -; GFX11-FAKE16-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX11-FAKE16-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-FAKE16-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-FAKE16-NEXT: s_mov_b32 s20, 0 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, -1 -; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo -; GFX11-FAKE16-NEXT: s_mov_b32 s32, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mul_lo_u32 v0, s19, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_13 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %bb14 -; GFX11-FAKE16-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c -; GFX11-FAKE16-NEXT: s_mov_b32 s18, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_bitcmp1_b32 s21, 0 -; GFX11-FAKE16-NEXT: s_cselect_b32 s24, -1, 0 -; GFX11-FAKE16-NEXT: s_bitcmp0_b32 s21, 0 -; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB2_3 -; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb15 -; GFX11-FAKE16-NEXT: s_add_u32 s8, s16, 0x58 -; GFX11-FAKE16-NEXT: s_addc_u32 s9, s17, 0 -; GFX11-FAKE16-NEXT: s_getpc_b64 s[0:1] -; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 -; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 -; GFX11-FAKE16-NEXT: s_mov_b32 s13, s14 -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-FAKE16-NEXT: s_mov_b32 s21, s14 -; GFX11-FAKE16-NEXT: s_mov_b32 s14, s15 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-FAKE16-NEXT: s_mov_b32 s14, s21 -; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_4 -; GFX11-FAKE16-NEXT: s_branch .LBB2_12 -; GFX11-FAKE16-NEXT: .LBB2_3: -; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB2_12 -; GFX11-FAKE16-NEXT: .LBB2_4: ; %bb16 -; GFX11-FAKE16-NEXT: s_load_b32 s0, s[16:17], 0x54 -; GFX11-FAKE16-NEXT: s_bitcmp1_b32 s23, 0 -; GFX11-FAKE16-NEXT: s_cselect_b32 s9, -1, 0 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s23, 1 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_bitcmp1_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, -1 -; GFX11-FAKE16-NEXT: s_cselect_b32 s8, -1, 0 -; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s1, 0 -; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB2_8 -; GFX11-FAKE16-NEXT: ; %bb.5: ; %bb18.preheader -; GFX11-FAKE16-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_mul_hi_u32 s0, s29, s28 -; GFX11-FAKE16-NEXT: s_mul_i32 s1, s29, s28 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshr_b32 
s0, s0, s30 -; GFX11-FAKE16-NEXT: s_mul_i32 s0, s0, s22 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_mul_i32 s0, s0, s20 -; GFX11-FAKE16-NEXT: s_or_b32 s0, s19, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, s1 -; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[20:21] -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-FAKE16-NEXT: .p2align 6 -; GFX11-FAKE16-NEXT: .LBB2_6: ; %bb18 -; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-FAKE16-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v0 -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-FAKE16-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s8, s1 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, exec_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s19, s13 -; GFX11-FAKE16-NEXT: s_and_b32 s13, 0xffff, s0 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 1 -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-FAKE16-NEXT: s_cselect_b32 s13, -1, 0 -; GFX11-FAKE16-NEXT: s_and_b32 s20, s9, exec_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s19, s13 -; GFX11-FAKE16-NEXT: s_or_b32 s19, s0, 0x100 -; GFX11-FAKE16-NEXT: s_and_b32 s13, 1, s13 -; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s13, 1 -; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s19, s0 -; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_6 -; GFX11-FAKE16-NEXT: ; %bb.7: ; %Flow -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: .LBB2_8: ; %Flow12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_12 -; GFX11-FAKE16-NEXT: ; %bb.9: -; GFX11-FAKE16-NEXT: s_xor_b32 s0, s8, -1 -; GFX11-FAKE16-NEXT: .LBB2_10: ; %bb17 -; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_10 -; GFX11-FAKE16-NEXT: ; %bb.11: ; %Flow6 -; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1 -; GFX11-FAKE16-NEXT: .LBB2_12: ; %Flow11 -; GFX11-FAKE16-NEXT: s_and_b32 s20, s2, exec_lo -; GFX11-FAKE16-NEXT: s_or_not1_b32 s0, s18, exec_lo -; GFX11-FAKE16-NEXT: .LBB2_13: ; %Flow9 -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s3, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_15 -; GFX11-FAKE16-NEXT: ; %bb.14: ; %bb43 -; GFX11-FAKE16-NEXT: s_add_u32 s8, s16, 0x58 -; GFX11-FAKE16-NEXT: s_addc_u32 s9, s17, 0 -; GFX11-FAKE16-NEXT: s_getpc_b64 s[0:1] -; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 -; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 -; GFX11-FAKE16-NEXT: s_mov_b32 s13, s14 -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; 
GFX11-FAKE16-NEXT: s_mov_b32 s14, s15 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-FAKE16-NEXT: s_or_b32 s20, s20, exec_lo -; GFX11-FAKE16-NEXT: .LBB2_15: ; %Flow14 -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s20 -; GFX11-FAKE16-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock -; GFX11-FAKE16-NEXT: ; divergent unreachable -; GFX11-FAKE16-NEXT: ; %bb.17: ; %UnifiedReturnBlock -; GFX11-FAKE16-NEXT: s_endpgm - +; GFX11-LABEL: f2: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX11-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-NEXT: s_load_b32 s19, s[16:17], 0x24 +; GFX11-NEXT: s_mov_b32 s12, s13 +; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b32 s20, 0 +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_lo_u32 v0, s19, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB2_13 +; GFX11-NEXT: ; %bb.1: ; %bb14 +; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c +; GFX11-NEXT: s_mov_b32 s18, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitcmp1_b32 s21, 0 +; GFX11-NEXT: s_cselect_b32 s24, -1, 0 +; GFX11-NEXT: s_bitcmp0_b32 s21, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 +; GFX11-NEXT: ; %bb.2: ; %bb15 +; GFX11-NEXT: s_add_u32 s8, s16, 0x58 +; GFX11-NEXT: s_addc_u32 s9, s17, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 +; GFX11-NEXT: s_mov_b32 s13, s14 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s21, s14 +; GFX11-NEXT: s_mov_b32 s14, s15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_mov_b32 s14, s21 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-NEXT: s_branch .LBB2_12 +; GFX11-NEXT: .LBB2_3: +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB2_12 +; GFX11-NEXT: .LBB2_4: ; %bb16 +; GFX11-NEXT: s_load_b32 s0, s[16:17], 0x54 +; GFX11-NEXT: s_bitcmp1_b32 s23, 0 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: s_and_b32 s1, s23, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitcmp1_b32 s0, 0 +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cselect_b32 s8, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 +; GFX11-NEXT: ; %bb.5: ; %bb18.preheader +; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mul_hi_u32 s0, s29, s28 +; GFX11-NEXT: s_mul_i32 s1, s29, s28 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_or_b32 s0, s0, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s0, s0, s30 +; GFX11-NEXT: s_mul_i32 s0, s0, s22 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s0, s0, s20 +; GFX11-NEXT: s_or_b32 s0, s19, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 +; GFX11-NEXT: s_mov_b32 s0, s1 +; GFX11-NEXT: global_load_u16 v1, v0, s[20:21] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB2_6: ; %bb18 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: s_and_b32 s1, s8, s1 +; GFX11-NEXT: s_and_b32 s1, s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-NEXT: s_cselect_b32 s1, s19, s13 +; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s1, s1, 1 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-NEXT: s_and_b32 s20, s9, exec_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 +; GFX11-NEXT: v_readfirstlane_b32 s13, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-NEXT: s_cselect_b32 s13, s19, s13 +; GFX11-NEXT: s_bitcmp1_b32 s13, 0 +; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s13, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB2_6 +; GFX11-NEXT: ; %bb.7: ; %Flow +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB2_8: ; %Flow12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB2_12 +; GFX11-NEXT: ; %bb.9: +; GFX11-NEXT: s_xor_b32 s0, s8, -1 +; GFX11-NEXT: .LBB2_10: ; %bb17 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB2_10 +; GFX11-NEXT: ; %bb.11: ; %Flow6 +; GFX11-NEXT: s_mov_b32 s18, -1 +; GFX11-NEXT: .LBB2_12: ; %Flow11 +; GFX11-NEXT: s_and_b32 s20, s2, exec_lo +; GFX11-NEXT: s_or_not1_b32 s0, s18, exec_lo +; GFX11-NEXT: .LBB2_13: ; %Flow9 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-NEXT: s_and_saveexec_b32 s3, s0 +; GFX11-NEXT: s_cbranch_execz .LBB2_15 +; GFX11-NEXT: ; %bb.14: ; %bb43 +; GFX11-NEXT: s_add_u32 s8, s16, 0x58 +; GFX11-NEXT: s_addc_u32 s9, s17, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 +; GFX11-NEXT: s_mov_b32 s13, s14 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s14, s15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_or_b32 s20, s20, exec_lo +; GFX11-NEXT: .LBB2_15: ; %Flow14 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-NEXT: s_and_saveexec_b32 s0, s20 +; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock +; GFX11-NEXT: ; divergent unreachable +; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock +; GFX11-NEXT: s_endpgm bb: %i = tail call i32 @llvm.amdgcn.workitem.id.x() %i12 = mul i32 %arg, %i From fe28221eb8d9b80f94523bcb2ac06efb060a18da Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 15 Jul 2025 08:49:33 -0500 Subject: [PATCH 29/29] Replace target-specific function names with target-independent names (Remove references to 
source modifiers).
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 85585472881a6..af1053ea446f6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -602,7 +602,7 @@ namespace {
     SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
                                    SDValue N2, SDValue N3, ISD::CondCode CC);
     SDValue foldSelectOfBinops(SDNode *N);
-    SDValue foldSelectOfSourceMods(SDNode *N);
+    SDValue bitmaskOperandsToSignInstructions(SDNode *N);
     SDValue foldSextSetcc(SDNode *N);
     SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                               const SDLoc &DL);
@@ -12176,7 +12176,9 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
   return SDValue();
 }
 
-static SDValue getBitwiseToSrcModifierOp(SDValue N, SelectionDAG &DAG) {
+// Match a bitwise operation that modifies only the sign bit of an integer
+// and return the equivalent FNEG, FABS, or FNEG-of-FABS node.
+static SDValue getBitMaskToInstruction(SDValue N, SelectionDAG &DAG) {
   unsigned Opc = N.getNode()->getOpcode();
   if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
     return SDValue();
@@ -12223,13 +12225,13 @@ static SDValue getBitwiseToSrcModifierOp(SDValue N, SelectionDAG &DAG) {
   return SDValue();
 }
 
-SDValue DAGCombiner::foldSelectOfSourceMods(SDNode *N) {
+SDValue DAGCombiner::bitmaskOperandsToSignInstructions(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue N2 = N->getOperand(2);
   EVT VT = N->getValueType(0);
-  SDValue SrcModN1 = getBitwiseToSrcModifierOp(N1, DAG);
-  SDValue SrcModN2 = getBitwiseToSrcModifierOp(N2, DAG);
+  SDValue SrcModN1 = getBitMaskToInstruction(N1, DAG);
+  SDValue SrcModN2 = getBitMaskToInstruction(N2, DAG);
   if (SrcModN1 || SrcModN2) {
     SDLoc SL(N);
     EVT FVT = SrcModN1 ? SrcModN1.getValueType() : SrcModN2.getValueType();
@@ -12456,9 +12458,9 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG))
     return R;
 
-  // Identify bitmask operations that are source mods and create
-  // the relevant fneg, fabs or fneg+fabs.
-  if (SDValue F = foldSelectOfSourceMods(N))
+  // Identify bitmask operations that modify only the sign bit and
+  // replace them with FNEG, FABS, or both, as appropriate.
+  if (SDValue F = bitmaskOperandsToSignInstructions(N))
     return F;
 
   return SDValue();
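
For orientation only (this block is not part of the patch series): a minimal, self-contained C++ sketch of the sign-bit classification that getBitMaskToInstruction performs on AND/XOR/OR masks. The SignOp enum, the classify helper, and its signature are illustrative assumptions for this sketch, not LLVM API.

// sign_bit_classify.cpp -- illustrative sketch, not LLVM code.
// Classifies op(value, mask) on a `bits`-wide integer as the equivalent
// floating-point sign-bit operation, mirroring the opcode/mask filter in
// getBitMaskToInstruction above.
#include <cstdint>
#include <cstdio>

enum class SignOp { None, FNeg, FAbs, FNegFAbs };

SignOp classify(char op, uint64_t mask, unsigned bits) {
  uint64_t SignBit = 1ULL << (bits - 1);
  switch (op) {
  case '^': // Flipping only the sign bit is fneg.
    return mask == SignBit ? SignOp::FNeg : SignOp::None;
  case '&': // Clearing only the sign bit is fabs.
    return mask == SignBit - 1 ? SignOp::FAbs : SignOp::None;
  case '|': // Setting only the sign bit is fneg(fabs).
    return mask == SignBit ? SignOp::FNegFAbs : SignOp::None;
  default:
    return SignOp::None;
  }
}

int main() {
  printf("%d\n", static_cast<int>(classify('^', 0x80000000u, 32)));          // 1: FNeg
  printf("%d\n", static_cast<int>(classify('&', 0x7fffffffu, 32)));          // 2: FAbs
  printf("%d\n", static_cast<int>(classify('|', 0x8000000000000000ULL, 64))); // 3: FNegFAbs
  return 0;
}

Any mask that touches bits other than the sign bit classifies as None, which corresponds to the early return SDValue() paths in the patched combine.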