From c6c2bb77fdfcfe0bbb4c7787bdb2057b1ccb45ee Mon Sep 17 00:00:00 2001
From: Chris Jackson
Date: Wed, 16 Jul 2025 09:41:01 -0500
Subject: [PATCH 01/25] [AMDGPU] Recognise bitmask operations as srcmods

Extend the VOP patterns to recognise when or/xor/and modify only the
sign bit and replace the operation with the appropriate srcmod.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |   32 +
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |   41 +-
 .../AMDGPU/integer-select-src-modifiers.ll    | 1011 +++++++++++++++++
 llvm/test/CodeGen/AMDGPU/saddsat.ll           |   52 +-
 llvm/test/CodeGen/AMDGPU/ssubsat.ll           |  378 +++---
 5 files changed, 1232 insertions(+), 282 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 00c7f0eb6e9f1..fe0e7eb279486 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3059,6 +3059,38 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
     Src = Src.getOperand(0);
   }
 
+  // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
+  // types as the codegen replaces the operand without adding a srcmod.
+  // This is intentionally finding the cases where we are performing float neg
+  // and abs on int types; the goal is not to obtain two's complement neg or
+  // abs.
+  // TODO: Add 16-bit support.
+  unsigned Opc = Src->getOpcode();
+  EVT VT = Src.getValueType();
+  if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
+      (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
+    return true;
+
+  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
+  if (!CRHS)
+    return true;
+
+  // Recognise (xor a, 0x80000000) as NEG SrcMod.
+  // Recognise (and a, 0x7fffffff) as ABS SrcMod.
+  // Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
+ if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) { + Mods |= SISrcMods::NEG; + Src = Src.getOperand(0); + } else if (Opc == ISD::AND && AllowAbs && + CRHS->getAPIntValue().isMaxSignedValue()) { + Mods |= SISrcMods::ABS; + Src = Src.getOperand(0); + } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) { + Mods |= SISrcMods::ABS; + Mods |= SISrcMods::NEG; + Src = Src.getOperand(0); + } + return true; } diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 1b092b283290a..5674ae328406d 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -349,29 +349,24 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: select_fneg_xor_select_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = xor i32 %arg0, -2147483648 %select0 = select i1 %cond0, i32 %arg1, i32 %fneg0 @@ -550,31 +545,25 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: select_fneg_xor_select_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg0 = xor i64 %arg0, 9223372036854775808 %select0 = select i1 %cond0, i64 %arg1, i64 %fneg0 diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll new file mode 100644 index 0000000000000..b3c7ac80dd014 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll @@ -0,0 +1,1011 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s + +define i32 @fneg_select_i32_1(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_select_i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %b + ret i32 %select +} + +define i32 @fneg_select_i32_2(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_select_i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v1, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v1, v2, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %b, i32 %neg.a + ret i32 %select +} + +define i32 @fneg_select_i32_both(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_select_i32_both: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i32_both: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc_lo +; 
GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %neg.b = xor i32 %b, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %neg.b + ret i32 %select +} + +define i32 @fneg_1_fabs_2_select_i32(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_1_fabs_2_select_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_1_fabs_2_select_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %abs.b = and i32 %a, u0x7fffffff + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %abs.b + ret i32 %select +} + +define i32 @s_fneg_select_i32_1(i32 inreg %cond, i32 inreg %a, i32 inreg %b) { +; GCN-LABEL: s_fneg_select_i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_xor_b32 s4, s17, 0x80000000 +; GCN-NEXT: s_cmp_eq_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, s18 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_select_i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s1, s1, 0x80000000 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %b + ret i32 %select +} + +define i32 @s_fneg_1_fabs_2_select_i32(i32 inreg %cond, i32 %a, i32 %b) { +; GCN-LABEL: s_fneg_1_fabs_2_select_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s16, 0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, -v0, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_1_fabs_2_select_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, |v0|, -v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i32 %a, u0x80000000 + %abs.b = and i32 %a, u0x7fffffff + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %abs.b + ret i32 %select +} + +define <2 x i32> @fneg_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_select_v2i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_v2i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, 
zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b + ret <2 x i32> %select +} + +define <2 x i32> @fneg_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_select_v2i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_v2i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a + ret <2 x i32> %select +} + +define i32 @fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fabs_select_i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i32 %a, u0x7fffffff + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %b + ret i32 %select +} + +define i32 @fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fabs_select_i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, |v1|, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, |v1|, v2, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i32 %a, u0x7fffffff + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %b, i32 %neg.a + ret i32 %select +} + +define <2 x i32> @fneg_1_fabs_2_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_1_fabs_2_select_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_1_fabs_2_select_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) + %abs.b = and <2 x i32> %a, splat (i32 u0x7fffffff) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %abs.b, <2 x i32> %neg.a + ret <2 x i32> %select +} + +define i32 @fneg_fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: 
fneg_fabs_select_i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %neg.a, i32 %b + ret i32 %select +} + +define i32 @fneg_fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) { +; GCN-LABEL: fneg_fabs_select_i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -|v1|, v2, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -|v1|, v2, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i32 %a, u0x80000000 + %cmp = icmp eq i32 %cond, zeroinitializer + %select = select i1 %cmp, i32 %b, i32 %neg.a + ret i32 %select +} + +define <2 x i32> @fneg_fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_fabs_select_v2i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_v2i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b + ret <2 x i32> %select +} + +define <2 x i32> @fneg_fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) { +; GCN-LABEL: fneg_fabs_select_v2i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_v2i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a + ret <2 x i32> %select +} + + +define <2 x i32> @s_fneg_select_v2i32_1(<2 x i32> inreg %cond, <2 x i32> inreg %a, <2 x i32> inreg %b) { +; GCN-LABEL: s_fneg_select_v2i32_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_xor_b32 s4, s19, 0x80000000 +; GCN-NEXT: s_xor_b32 s5, s18, 
0x80000000 +; GCN-NEXT: s_cmp_eq_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s5, s5, s20 +; GCN-NEXT: s_cmp_eq_u32 s17, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, s21 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_select_v2i32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, s2, s16 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_cselect_b32 s1, s3, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b + ret <2 x i32> %select +} + +define <2 x i32> @s_fneg_fabs_select_v2i32_2(<2 x i32> inreg %cond, <2 x i32> inreg %a, <2 x i32> inreg %b) { +; GCN-LABEL: s_fneg_fabs_select_v2i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_bitset1_b32 s19, 31 +; GCN-NEXT: s_bitset1_b32 s18, 31 +; GCN-NEXT: s_cmp_eq_u32 s16, 0 +; GCN-NEXT: s_cselect_b32 s4, s20, s18 +; GCN-NEXT: s_cmp_eq_u32 s17, 0 +; GCN-NEXT: s_cselect_b32 s5, s21, s19 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_fabs_select_v2i32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s3, 31 +; GFX11-NEXT: s_bitset1_b32 s2, 31 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, s16, s2 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_cselect_b32 s1, s17, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or <2 x i32> %a, splat (i32 u0x80000000) + %cmp = icmp eq <2 x i32> %cond, zeroinitializer + %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a + ret <2 x i32> %select +} + +define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_select_i64_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_select_i64_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: 
v_cndmask_b32_e64 v1, -v3, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_1_fabs_2_select_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, |v5|, -v3, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_1_fabs_2_select_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, |v5|, -v3, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %abs.b = and i64 %b, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %abs.b + ret i64 %select +} + +define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fabs_select_i64_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i64 %a, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fabs_select_i64_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fabs_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i64 %a, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_fabs_select_i64_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @fneg_fabs_select_i64_2(i64 
%cond, i64 %a, i64 %b) { +; GCN-LABEL: fneg_fabs_select_i64_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fneg_fabs_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i64 @s_fneg_select_i64_1(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fneg_select_i64_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_xor_b32 s6, s19, 0x80000000 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s18, s20 +; GFX7-NEXT: s_cselect_b32 s5, s6, s21 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fneg_select_i64_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s4, s19, 0x80000000 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s5, s18, s20 +; GFX9-NEXT: s_cselect_b32 s4, s4, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s2, s16 +; GFX11-NEXT: s_cselect_b32 s1, s3, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @s_fneg_select_i64_2(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fneg_select_i64_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_xor_b32 s6, s19, 0x80000000 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s20, s18 +; GFX7-NEXT: s_cselect_b32 s5, s21, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fneg_select_i64_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s4, s19, 0x80000000 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s5, s20, s18 +; GFX9-NEXT: s_cselect_b32 s4, s21, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s16, s2 +; GFX11-NEXT: s_cselect_b32 s1, s17, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i64 @s_fneg_1_fabs_2_select_i64(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fneg_1_fabs_2_select_i64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_xor_b32 s6, s19, 0x80000000 +; GFX7-NEXT: s_bitset0_b32 s21, 31 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s18, s20 +; GFX7-NEXT: s_cselect_b32 s5, s6, s21 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fneg_1_fabs_2_select_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s4, s19, 0x80000000 +; GFX9-NEXT: s_bitset0_b32 s21, 31 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s5, s18, s20 +; GFX9-NEXT: s_cselect_b32 s4, s4, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_1_fabs_2_select_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_bitset0_b32 s17, 31 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s2, s16 +; GFX11-NEXT: s_cselect_b32 s1, s3, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i64 %a, u0x8000000000000000 + %abs.b = and i64 %b, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %abs.b + ret i64 %select +} + +define i64 @s_fabs_select_i64_1(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fabs_select_i64_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_bitset0_b32 s19, 31 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s18, s20 +; GFX7-NEXT: s_cselect_b32 s5, s19, s21 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fabs_select_i64_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_bitset0_b32 s19, 31 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s4, s18, s20 +; GFX9-NEXT: s_cselect_b32 s5, s19, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fabs_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_bitset0_b32 s3, 31 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s2, s16 +; GFX11-NEXT: s_cselect_b32 s1, s3, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i64 %a, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @s_fabs_select_i64_2(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fabs_select_i64_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: 
s_bitset0_b32 s19, 31 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s20, s18 +; GFX7-NEXT: s_cselect_b32 s5, s21, s19 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fabs_select_i64_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_bitset0_b32 s19, 31 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s4, s20, s18 +; GFX9-NEXT: s_cselect_b32 s5, s21, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fabs_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_bitset0_b32 s3, 31 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s16, s2 +; GFX11-NEXT: s_cselect_b32 s1, s17, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = and i64 %a, u0x7fffffffffffffff + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i64 @s_fneg_fabs_select_i64_1(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fneg_fabs_select_i64_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_bitset1_b32 s19, 31 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s18, s20 +; GFX7-NEXT: s_cselect_b32 s5, s19, s21 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fneg_fabs_select_i64_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_bitset1_b32 s19, 31 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s4, s18, s20 +; GFX9-NEXT: s_cselect_b32 s5, s19, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_fabs_select_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s3, 31 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s2, s16 +; GFX11-NEXT: s_cselect_b32 s1, s3, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %neg.a, i64 %b + ret i64 %select +} + +define i64 @s_fneg_fabs_select_i64_2(i64 inreg %cond, i64 inreg %a, i64 inreg %b) { +; GFX7-LABEL: s_fneg_fabs_select_i64_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0 +; GFX7-NEXT: s_bitset1_b32 s19, 31 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s20, s18 +; GFX7-NEXT: s_cselect_b32 s5, s21, s19 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_fneg_fabs_select_i64_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_bitset1_b32 s19, 31 +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s4, s20, s18 +; GFX9-NEXT: s_cselect_b32 s5, s21, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_fneg_fabs_select_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s3, 31 +; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0 +; GFX11-NEXT: s_cselect_b32 s0, s16, s2 +; GFX11-NEXT: s_cselect_b32 s1, s17, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %neg.a = or i64 %a, u0x8000000000000000 + %cmp = icmp eq i64 %cond, zeroinitializer + %select = select i1 %cmp, i64 %b, i64 %neg.a + ret i64 %select +} + +define i16 @fneg_select_i16_1(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_select_i16_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i16_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_select_i16_1: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_select_i16_1: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %neg.a, i16 %b + ret i16 %select +} + +define i16 @fneg_select_i16_2(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_select_i16_2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i16_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_select_i16_2: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_select_i16_2: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %b, i16 %neg.a + ret i16 %select +} + +define i16 @fneg_select_i16_both(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_select_i16_both: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX7-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_select_i16_both: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_select_i16_both: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_select_i16_both: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %neg.b = xor i16 %b, u0x8000 + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %neg.a, i16 %neg.b + ret i16 %select +} + +define i16 @fneg_1_fabs_2_select_i16(i16 %cond, i16 %a, i16 %b) { +; GFX7-LABEL: fneg_1_fabs_2_select_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: fneg_1_fabs_2_select_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fneg_1_fabs_2_select_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v1.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_1_fabs_2_select_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %neg.a = xor i16 %a, u0x8000 + %abs.b = and i16 %a, u0x7fff + %cmp = icmp eq i16 %cond, zeroinitializer + %select = select i1 %cmp, i16 %neg.a, i16 %abs.b + ret i16 %select +} diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll index 4e27cf20d3c98..c52f7a4ac720a 100644 --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -124,9 +124,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { ; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v0, v1 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i32: @@ -136,9 +135,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v0, v1 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_i32: @@ -383,16 +381,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i32: @@ -402,16 +398,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v2i32: @@ -442,8 +436,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: 
v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i64: @@ -456,8 +449,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_i64: @@ -470,8 +462,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_i64: @@ -480,12 +471,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_saddsat_i64: @@ -494,11 +484,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 40d80f5e83e36..09c0e775f783d 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -124,9 +124,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v0, v1 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i32: @@ -136,9 +135,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v0, v1 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: 
v_cndmask_b32_e64 v0, v1, -v0, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i32: @@ -383,16 +381,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i32: @@ -402,16 +398,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v2i32: @@ -439,23 +433,20 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -v1, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v3, -v2, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v3i32: @@ -465,23 +456,20 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v0, 
0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, -v2, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v3i32: @@ -511,30 +499,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -v1, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, -v2, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v7 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, -v3, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v4i32: @@ -544,30 +528,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: 
v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, -v2, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, -v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v4i32: @@ -599,58 +579,50 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v8 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v8, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v8, -v1, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v10 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v8, -v2, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v8, -v3, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, -v4, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v8, -v5, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14 ; 
GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, -v6, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, -v7, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v8i32: @@ -660,58 +632,50 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v0, v8 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v8, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, -v2, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, -v3, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, -v4, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, -v5, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; 
GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, -v6, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, -v7, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v8i32: @@ -751,116 +715,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v16 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v16, -v0, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v1, v17 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v16, -v1, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v16, -v2, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v16, -v3, s[4:5] ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v4, v20 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v4, v17, -v4, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v17, -v5, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v6, v22 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX6-NEXT: v_cmp_lt_i32_e64 
s[4:5], v17, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v6, v17, -v6, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v23 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, v17, -v7, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v8, v24 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v8, v17, -v8, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v9, v25 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 ; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v9, v17, -v9, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v10, v26 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 ; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v10, v17, -v10, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v11, v27 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 ; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v11, v17, -v11, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v12, v28 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v12, v17, -v12, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v13, v29 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 ; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v13, v17, -v13, s[4:5] ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v14, v30 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; 
GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v14, v17, -v14, s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v16 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v15, v16, -v15, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v16i32: @@ -870,116 +818,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v0, v16 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, -v0, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v1, v17 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v16, -v1, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v16, -v2, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v16, -v3, s[4:5] ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v4, v20 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, -v4, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, -v5, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v6, v22 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 ; GFX8-NEXT: 
v_ashrrev_i32_e32 v6, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, -v6, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v7, v23 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, -v7, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v8, v24 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, -v8, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v9, v25 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, -v9, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v10, v26 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, -v10, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v11, v27 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, -v11, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v12, v28 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, -v12, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v13, v29 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 ; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, -v13, s[4:5] ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v14, v30 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, 
s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, -v14, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v16 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v15, v16, -v15, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v16i32: @@ -1066,8 +998,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i64: @@ -1080,8 +1011,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i64: @@ -1094,8 +1024,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_i64: @@ -1104,12 +1033,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_i64: @@ -1118,11 +1046,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result From d0186bac556b4c500b15c100f051d9e8f183a749 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 20 May 2025 05:14:36 -0500 Subject: [PATCH 02/25] [AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions Make use of 
s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these causes a number of test regressions, so extra work in the combiner and TableGen patterns was necessary. - Use custom for v2i32 rotr instead of additional patterns. Modify performOrCombine() to remove some identity OR operations. - Fix rotr regression by adding lowerROTR() on the legalizer codepath. - Add test case to rotr.ll. - Extend performFNEGCombine() for the SELECT case. - Modify performSelectCombine() and foldFreeOpFromSelect to prevent the performFNEGCombine() changes from being unwound. - Add cases to or.ll and xor.ll to demonstrate the generation of the s_or_b64 and s_xor_b64 instructions for the v2i32 cases. Previously this was inhibited by "-amdgpu-scalarize-global-loads=false". - Fix shl/srl64_reduce regression by performing the scalarisation previously performed by the vector legaliser in the combiner. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 146 +++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 95 ++- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + llvm/lib/Target/AMDGPU/SIInstructions.td | 47 +- llvm/lib/Target/AMDGPU/SOPInstructions.td | 15 + llvm/lib/Target/AMDGPU/VOP2Instructions.td | 8 +- llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 20 +- llvm/test/CodeGen/AMDGPU/bfi_int.ll | 4 +- .../AMDGPU/copysign-simplify-demanded-bits.ll | 4 +- .../AMDGPU/dag-preserve-disjoint-flag.ll | 36 +- .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 16 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 188 ++--- llvm/test/CodeGen/AMDGPU/or.ll | 677 +++++++++++++++++- llvm/test/CodeGen/AMDGPU/rotr.ll | 128 ++++ .../CodeGen/AMDGPU/vector_range_metadata.ll | 8 +- llvm/test/CodeGen/AMDGPU/xor.ll | 630 +++++++++++++++- 16 files changed, 1805 insertions(+), 218 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e3ca09e512b3b..779ce2daf53fc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4032,9 +4032,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( - DAGCombinerInfo &DCI, const SDLoc &SL, - unsigned Opc, SDValue LHS, - uint32_t ValLo, uint32_t ValHi) const { + DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, + uint32_t ValLo, uint32_t ValHi) const { SelectionDAG &DAG = DCI.DAG; SDValue Lo, Hi; std::tie(Lo, Hi) = split64BitValue(LHS, DAG); @@ -4063,6 +4062,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDLoc SL(N); SelectionDAG &DAG = DCI.DAG; + // When the shl64_reduce optimisation code is passed through vector + // legalization some scalarising occurs. After ISD::AND was legalised, this + // resulted in the AND instructions no longer being elided, as mentioned + // below. The following code should make sure this takes place. + if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue VAND = RHS.getOperand(0); + if (ConstantSDNode *CRRHS = dyn_cast(RHS->getOperand(1))) { + uint64_t AndIndex = RHS->getConstantOperandVal(1); + if (VAND->getOpcode() == ISD::AND && CRRHS) { + SDValue LHSAND = VAND.getOperand(0); + SDValue RHSAND = VAND.getOperand(1); + if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) { + // Part of shlcombine is to optimise for the case where it's possible + // to reduce shl64 to shl32 if shift range is [63-32]. 
This + // transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31) ]. The + // '&' is then elided by ISel. The vector code for this was being + // completely scalarised by the vector legalizer, but now v2i32 is + // made legal the vector legaliser only partially scalarises the + // vector operations and the and was not elided. This check enables us + // to locate and scalarise the v2i32 and and re-enable ISel to elide + // the and instruction. + ConstantSDNode *CANDL = + dyn_cast(RHSAND->getOperand(0)); + ConstantSDNode *CANDR = + dyn_cast(RHSAND->getOperand(1)); + if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f && + RHSAND->getConstantOperandVal(1) == 0x1f) { + // Get the non-const AND operands and produce scalar AND + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, + LHSAND, Zero); + SDValue Hi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One); + SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32); + SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask); + SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); + if (AndIndex == 0 || AndIndex == 1) + return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc, + AndIndex == 0 ? LoAnd : HiAnd, N->getFlags()); + } + } + } + } + } + unsigned RHSVal; if (CRHS) { RHSVal = CRHS->getZExtValue(); @@ -4104,8 +4150,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, if (VT.getScalarType() != MVT::i64) return SDValue(); - // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32)) - // On some subtargets, 64-bit shift is a quarter rate instruction. In the // common case, splitting this into a move and a 32-bit shift is faster and // the same code size. @@ -4267,6 +4311,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, SDLoc SL(N); unsigned RHSVal; + // When the shl64_reduce optimisation code is passed through vector + // legalization some scalarising occurs. After ISD::AND was legalised, this + // resulted in the AND instructions no longer being elided, as mentioned + // below. The following code should make sure this takes place. + if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue VAND = RHS.getOperand(0); + if (ConstantSDNode *CRRHS = dyn_cast(RHS->getOperand(1))) { + uint64_t AndIndex = RHS->getConstantOperandVal(1); + if (VAND->getOpcode() == ISD::AND && CRRHS) { + SDValue LHSAND = VAND.getOperand(0); + SDValue RHSAND = VAND.getOperand(1); + if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) { + // Part of srlcombine is to optimise for the case where its possible + // to reduce shl64 to shl32 if shift range is [63-32]. This + // transforms: DST = shl i64 X, Y to [0, srl i32 X, (Y & 31) ]. The + // '&' is then elided by ISel. The vector code for this was being + // completely scalarised by the vector legalizer, but now v2i32 is + // made legal the vector legaliser only partially scalarises the + // vector operations and the and was not elided. This check enables us + // to locate and scalarise the v2i32 and and re-enable ISel to elide + // the and instruction. 
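+            // As an illustrative sketch only (t0-t5 are placeholder node
+            // names; operand order and exact node types may differ in
+            // practice), the shift-amount shape being looked for here is:
+            //   t1: v2i32 = BUILD_VECTOR Constant:i32<31>, Constant:i32<31>
+            //   t2: v2i32 = and t0, t1
+            //   t3: i32 = extract_vector_elt t2, <0 or 1>
+            //   t4 = srl t5, t3
+            // The mask is then rebuilt below on the extracted scalar element
+            // so that ISel can once again elide it.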
+ ConstantSDNode *CANDL = + dyn_cast(RHSAND->getOperand(0)); + ConstantSDNode *CANDR = + dyn_cast(RHSAND->getOperand(1)); + if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f && + RHSAND->getConstantOperandVal(1) == 0x1f) { + // Get the non-const AND operands and produce scalar AND + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, + LHSAND, Zero); + SDValue Hi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One); + SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32); + SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask); + SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); + if (AndIndex == 0 || AndIndex == 1) + return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc, + AndIndex == 0 ? LoAnd : HiAnd, N->getFlags()); + } + } + } + } + } + if (CRHS) { RHSVal = CRHS->getZExtValue(); @@ -4780,8 +4871,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) return SDValue(); - return distributeOpThroughSelect(DCI, LHS.getOpcode(), - SDLoc(N), Cond, LHS, RHS); + // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be + // lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled + // out in this case. For now I've made the logic as specific to the case as + // possible, hopefully this can be relaxed in future. + if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) { + SDValue LHSB = LHS.getOperand(0); + SDValue RHSB = RHS.getOperand(0); + if (LHSB.getOpcode() == ISD::BITCAST && + RHSB->getOpcode() == ISD::BITCAST) { + EVT LHSBOpTy = LHSB->getOperand(0).getValueType(); + EVT RHSBOpTy = RHSB->getOperand(0).getValueType(); + if (LHSB.getValueType() == MVT::f32 && + RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 && + RHSBOpTy == MVT::i32) + return SDValue(); + } + } + + return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS, + RHS); } bool Inv = false; @@ -4834,8 +4943,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, if (Inv) std::swap(NewLHS, NewRHS); - SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, - Cond, NewLHS, NewRHS); + SDValue NewSelect = + DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS); DCI.AddToWorklist(NewSelect.getNode()); return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect); } @@ -5256,8 +5365,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, } case ISD::SELECT: { // fneg (select c, a, b) -> select c, (fneg a), (fneg b) + // This combine became necessary recently to prevent a regression in + // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor. + // Specifically, additional instructions were added to the final codegen. + // When adding this combine a case was added to performFNEGCombine to + // prevent this combine from being undone under certain conditions. // TODO: Invert conditions of foldFreeOpFromSelect - return SDValue(); + SDValue Cond = N0.getOperand(0); + SDValue LHS = N0.getOperand(1); + SDValue RHS = N0.getOperand(2); + EVT LHVT = LHS.getValueType(); + EVT RHVT = RHS.getValueType(); + // The regression was limited to i32 v2/i32. 
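+    // As an illustrative sketch only (cond, a and b are placeholders; the
+    // type check below is what actually gates the fold):
+    //   fneg (select cond, a, b) --> select cond, (fneg a), (fneg b)
+    // Keeping each fneg next to a select operand lets the select still be
+    // lowered to a V_CNDMASK with negate source modifiers rather than a
+    // separate XOR of the sign bit.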
+ if (RHVT != MVT::i32 && LHVT != MVT::i32) + return SDValue(); + + SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS); + SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS); + SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg); + return Op; } case ISD::BITCAST: { SDLoc SL(N); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index bc0fd8d4e814b..e2162bb56ff08 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -440,6 +440,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal); } + setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal); + // Prevent SELECT v2i32 from being implemented with the above bitwise ops and + // instead lower to cndmask in SITargetLowering::LowerSELECT(). + setOperationAction(ISD::SELECT, MVT::v2i32, Custom); + // Enable MatchRotate to produce ISD::ROTR, which is later transformed to + // alignbit. + setOperationAction(ISD::ROTR, MVT::v2i32, Custom); + setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16}, Custom); @@ -6079,6 +6087,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); } +// Enable lowering of ROTR for vxi32 types. This is a workaround for a +// regression whereby extra unnecessary instructions were added to codegen +// for rotr operations, casued by legalising v2i32 or. This resulted in extra +// instructions to extract the result from the vector. +SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const { + [[maybe_unused]] EVT VT = Op.getValueType(); + + assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 || + VT == MVT::v16i32) && + "Unexpected ValueType."); + + return DAG.UnrollVectorOp(Op.getNode()); +} + // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the // wider vector type is legal. SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, @@ -6270,6 +6292,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerGET_FPENV(Op, DAG); case ISD::SET_FPENV: return lowerSET_FPENV(Op, DAG); + case ISD::ROTR: + return lowerROTR(Op, DAG); } return SDValue(); } @@ -13252,6 +13276,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, } } + // Detect identity v2i32 OR and replace with identity source node. + // Specifically an Or that has operands constructed from the same source node + // via extract_vector_elt and build_vector. I.E. + // v2i32 or( + // v2i32 build_vector( + // i32 extract_elt(%IdentitySrc, 0), + // i32 0 + // ), + // v2i32 build_vector( + // i32 0, + // i32 extract_elt(%IdentitySrc, 1) + // ) ) + // => + // v2i32 %IdentitySrc + + if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR && + RHS->getOpcode() == ISD::BUILD_VECTOR) { + + ConstantSDNode *LC = dyn_cast(LHS->getOperand(1)); + ConstantSDNode *RC = dyn_cast(RHS->getOperand(0)); + + // Test for and normalise build vectors. + if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) { + + // Get the extract_vector_element operands. + SDValue LEVE = LHS->getOperand(0); + SDValue REVE = RHS->getOperand(1); + + if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + // Check that different elements from the same vector are + // extracted. 
+ if (LEVE->getOperand(0) == REVE->getOperand(0) && + LEVE->getOperand(1) != REVE->getOperand(1)) { + SDValue IdentitySrc = LEVE.getOperand(0); + return IdentitySrc; + } + } + } + } + if (VT != MVT::i64 || DCI.isBeforeLegalizeOps()) return SDValue(); @@ -13296,13 +13361,39 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, if (SDValue RV = reassociateScalarOps(N, DCI.DAG)) return RV; + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) { + + const ConstantSDNode *CRHS0 = dyn_cast(RHS.getOperand(0)); + const ConstantSDNode *CRHS1 = dyn_cast(RHS.getOperand(1)); + SDValue LHS_0 = LHS.getOperand(0); + SDValue LHS_1 = LHS.getOperand(1); + + if (LHS.getOpcode() == ISD::VSELECT && CRHS0 && + CRHS0->getAPIntValue().isSignMask() && + shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 && + CRHS1->getAPIntValue().isSignMask() && + shouldFoldFNegIntoSrc(N, LHS_1)) { + + SDLoc DL(N); + SDValue CastLHS = + DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1)); + SDValue CastRHS = + DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2)); + SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS); + SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS); + SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32, + LHS->getOperand(0), FNegLHS, FNegRHS); + return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect); + } + } + const ConstantSDNode *CRHS = dyn_cast(RHS); - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); if (CRHS && VT == MVT::i64) { if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS)) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index acf6158572a4d..894e38dedacf4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -443,6 +443,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d05be8f95c618..4dc94afbcb7b5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1903,7 +1903,6 @@ def : GCNPat < >; } - /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ @@ -2457,9 +2456,9 @@ def : AMDGPUPatIgnoreCopies < (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) >; -// 64-bit version +foreach vt = [i64, v2i32] in { def : AMDGPUPatIgnoreCopies < - (DivergentBinFrag i64:$z, (and i64:$x, (xor i64:$y, i64:$z))), + (DivergentBinFrag vt:$z, (and vt:$x, (xor vt:$y, vt:$z))), (REG_SEQUENCE VReg_64, (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), @@ -2468,6 +2467,7 @@ def : AMDGPUPatIgnoreCopies < (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) >; +} def : AMDGPUPat < (fcopysign f32:$src0, f32:$src1), @@ -2541,30 +2541,25 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), } // end True16Predicate = NotHasTrue16BitInsts let 
True16Predicate = UseRealTrue16Insts in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - (EXTRACT_SUBREG $src1, lo16), - /* clamp */ 0, /* op_sel */ 0) ->; - -def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), - (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */ - (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), - 0, /* src1_modifiers */ - (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), - 0, /* src2_modifiers */ - (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)), - /* clamp */ 0, /* op_sel */ 0)>; + def : GCNPat<(rotr i32:$src0, i32:$src1), + (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0, + /* src1_modifiers */ 0, $src0, + /* src2_modifiers */ 0, (EXTRACT_SUBREG $src1, lo16), + /* clamp */ 0, /* op_sel */ 0)>; -def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), - (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src1, - /* src2_modifiers */ 0, - (EXTRACT_SUBREG VGPR_32:$src2, lo16), - /* clamp */ 0, /* op_sel */ 0)>; + def : GCNPat< + (i32(trunc(srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), + (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */ + (i32(EXTRACT_SUBREG(i64 $src0), sub1)), 0, /* src1_modifiers */ + (i32(EXTRACT_SUBREG(i64 $src0), sub0)), 0, /* src2_modifiers */ + (i16(EXTRACT_SUBREG VGPR_32:$src1, lo16)), + /* clamp */ 0, /* op_sel */ 0)>; + + def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), + (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0, + /* src1_modifiers */ 0, $src1, + /* src2_modifiers */ 0, (EXTRACT_SUBREG VGPR_32:$src2, lo16), + /* clamp */ 0, /* op_sel */ 0)>; } // end True16Predicate = UseRealTrue16Insts let True16Predicate = UseFakeTrue16Insts in { diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e103ccc2f00e6..77fcca160c0b9 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1828,6 +1828,21 @@ def : GCNPat < (S_MOV_B32 imm:$imm) >; +def : GCNPat < + (v2i32 (UniformBinFrag v2i32:$x, v2i32:$y)), + (S_AND_B64 SReg_64:$x, SReg_64:$y) +>; + +def : GCNPat < + (v2i32 (UniformBinFrag v2i32:$x, v2i32:$y)), + (S_OR_B64 SReg_64:$x, SReg_64:$y) +>; + +def : GCNPat < + (v2i32 (UniformBinFrag v2i32:$x, v2i32:$y)), + (S_XOR_B64 SReg_64:$x, SReg_64:$y) +>; + // Same as a 32-bit inreg def : GCNPat< (i32 (UniformUnaryFrag i16:$src)), diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 030a6e1e978c1..8b1d92d8ebca3 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1015,9 +1015,9 @@ def : DivergentClampingBinOp; def : DivergentBinOp; def : DivergentBinOp; -class divergent_i64_BinOp : +class divergent_i64_BinOp : GCNPat< - (DivergentBinFrag i64:$src0, i64:$src1), + (DivergentBinFrag vt:$src0, vt:$src1), (REG_SEQUENCE VReg_64, (Inst (i32 (EXTRACT_SUBREG $src0, sub0)), @@ -1034,6 +1034,10 @@ def : divergent_i64_BinOp ; def : divergent_i64_BinOp ; def : divergent_i64_BinOp ; +def : divergent_i64_BinOp ; +def : divergent_i64_BinOp ; +def : divergent_i64_BinOp ; + // mul24 w/ 64 bit output. 
class mul24_64_Pat : GCNPat< (i64 (Op i32:$src0, i32:$src1)), diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index 5b4866c386793..ca8f7736f6093 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -151,25 +151,25 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v: ; GFX-950: ; %bb.0: ; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[2:3] +; GFX-950-NEXT: v_and_b32_e32 v4, 1, v6 +; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6 ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]| -; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5] -; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7 +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[2:3], v[4:5] +; GFX-950-NEXT: v_cvt_f32_f64_e32 v7, v[0:1] ; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3] ; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2 ; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1] ; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 -; GFX-950-NEXT: v_and_b32_e32 v6, 1, v5 +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 +; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7 ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]| -; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3] -; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6 +; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[0:1], v[2:3] ; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] -; GFX-950-NEXT: v_add_u32_e32 v0, v5, v0 +; GFX-950-NEXT: v_add_u32_e32 v0, v7, v0 ; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4 ; GFX-950-NEXT: ; return to shader part epilog %res = fptrunc <2 x double> %src to <2 x bfloat> diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index b372dec383344..987555fbaaafb 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -582,15 +582,15 @@ define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> % ; GFX7-LABEL: v_bitselect_v2i32_pat1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4 ; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5 +; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bitselect_v2i32_pat1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4 ; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5 +; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_bitselect_v2i32_pat1: diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index 021104114d796..f5227eed458d6 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -31,8 +31,8 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3 ; GFX9-LABEL: test_pown_reduced_fast_v2f16_known_odd: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_or_b32_e32 v1, 
1, v1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 @@ -126,8 +126,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_or_b32_e32 v6, 1, v5 +; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cvt_f64_i32_e32 v[4:5], v4 ; GFX9-NEXT: v_cvt_f64_i32_e32 v[6:7], v6 ; GFX9-NEXT: s_brev_b32 s4, -2 diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll index d63a36c4b2958..7e2e8b577e085 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll @@ -28,12 +28,15 @@ define amdgpu_ps <2 x i32> @s_or_v2i32_disjoint(<2 x i32> inreg %a, <2 x i32> in ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc - ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 @@ -64,10 +67,23 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) { ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec - ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] - ; CHECK-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 
= IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY5]], killed [[COPY4]], implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY7]], killed [[COPY6]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_1]], %subreg.sub0, killed [[V_OR_B32_e64_]], %subreg.sub1 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; CHECK-NEXT: $vgpr0 = COPY [[COPY8]] + ; CHECK-NEXT: $vgpr1 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 %result = or disjoint <2 x i32> %a, %b ret <2 x i32> %result diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 5674ae328406d..94f41097b7aa1 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1634,12 +1634,12 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_cselect_b32 s1, s1, s3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, -v0, -v1, vcc ; GFX7-NEXT: s_cselect_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1658,10 +1658,10 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b32 s1, s1, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, -v0, -v1, vcc ; GFX9-NEXT: s_cselect_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -1672,17 +1672,17 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x18 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-NEXT: s_bitcmp1_b32 s6, 0 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, -s3, -v0, vcc_lo ; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s1, s1, s3 ; GFX11-NEXT: s_cselect_b32 s0, s0, s2 -; GFX11-NEXT: 
v_cndmask_b32_e64 v1, s1, -v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, s1, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 4a79096442c96..7afd99ddb0ef6 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -2010,61 +2010,61 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; SI-LABEL: v_fshr_v2i24: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; SI-NEXT: s_mov_b32 s4, 0xaaaaaab -; SI-NEXT: v_mul_hi_u32 v6, v6, s4 -; SI-NEXT: v_and_b32_e32 v7, 0xffffff, v5 +; SI-NEXT: v_mul_hi_u32 v6, v4, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; SI-NEXT: v_mul_hi_u32 v6, v7, s4 +; SI-NEXT: v_mul_hi_u32 v6, v5, s4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v6 -; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; SI-NEXT: v_mul_u32_u24_e32 v2, 24, v6 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v2i24: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; VI-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; VI-NEXT: s_mov_b32 s4, 0xaaaaaab -; VI-NEXT: v_mul_hi_u32 v6, v6, s4 -; VI-NEXT: v_and_b32_e32 v7, 0xffffff, v5 +; VI-NEXT: v_mul_hi_u32 v6, v4, s4 +; VI-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_mul_hi_u32 v6, v7, s4 +; VI-NEXT: v_mul_hi_u32 v6, v5, s4 ; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v6 -; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3 -; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; VI-NEXT: v_mul_u32_u24_e32 v2, 24, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2 +; VI-NEXT: v_alignbit_b32 v1, v1, v3, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i24: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab -; GFX9-NEXT: v_mul_hi_u32 v6, v6, s4 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, v4, s4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, v7, s4 +; GFX9-NEXT: v_mul_hi_u32 v6, v5, s4 ; GFX9-NEXT: v_add_u32_e32 v4, 8, v4 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v6 -; 
GFX9-NEXT: v_sub_u32_e32 v3, v5, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 8, v3 -; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, 24, v6 +; GFX9-NEXT: v_sub_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v2, 8, v2 +; GFX9-NEXT: v_alignbit_b32 v1, v1, v3, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fshr_v2i24: @@ -2075,12 +2075,12 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX10-LABEL: v_fshr_v2i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 -; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 +; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 ; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; GFX10-NEXT: v_mul_u32_u24_e32 v7, 24, v7 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 @@ -2091,109 +2091,29 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: v_fshr_v2i24: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 -; GFX11-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: v_fshr_v2i24: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 -; GFX11-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; 
GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-TRUE16-LABEL: v_fshr_v2i24: -; GFX12-TRUE16: ; %bb.0: -; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 -; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 -; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 -; GFX12-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l -; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-FAKE16-LABEL: v_fshr_v2i24: -; GFX12-FAKE16: ; %bb.0: -; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 -; GFX12-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_fshr_v2i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) ret <2 x i24> %ret } diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 1abd2e6b60f2f..26751b289a385 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -1,8 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6S %s +; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8S %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s +;; Added tests with "-amdgpu-scalarize-global-loads=true" to allow the generation of s_or_b64, particularly in the v2i32 case. See SWDEV-517886. +;; Also removed the previously unused "GCN" check-prefixes from the test. 
+ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v2i32: ; GFX6: ; %bb.0: @@ -18,8 +23,8 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -37,11 +42,39 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8-NEXT: s_mov_b32 s4, s0 ; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: or_v2i32: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: v_mov_b32_e32 v1, s5 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: or_v2i32: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: v_mov_b32_e32 v1, s5 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: or_v2i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -112,6 +145,44 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: or_v4i32: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX6S-NEXT: s_mov_b32 s11, 0xf000 +; GFX6S-NEXT: s_mov_b32 s10, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s3, s3, s7 +; GFX6S-NEXT: s_or_b32 s2, s2, s6 +; GFX6S-NEXT: s_or_b32 s1, s1, s5 +; GFX6S-NEXT: s_or_b32 s0, s0, s4 +; GFX6S-NEXT: v_mov_b32_e32 v0, s0 +; GFX6S-NEXT: v_mov_b32_e32 v1, s1 +; GFX6S-NEXT: v_mov_b32_e32 v2, s2 +; GFX6S-NEXT: v_mov_b32_e32 v3, s3 +; GFX6S-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: or_v4i32: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX8S-NEXT: s_mov_b32 s11, 0xf000 +; GFX8S-NEXT: s_mov_b32 s10, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s3, s3, s7 +; GFX8S-NEXT: s_or_b32 s2, s2, s6 +; GFX8S-NEXT: s_or_b32 s1, s1, s5 +; GFX8S-NEXT: s_or_b32 s0, s0, s4 +; GFX8S-NEXT: v_mov_b32_e32 v0, s0 +; GFX8S-NEXT: v_mov_b32_e32 v1, s1 +; GFX8S-NEXT: v_mov_b32_e32 v2, s2 +; GFX8S-NEXT: v_mov_b32_e32 v3, s3 +; GFX8S-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: or_v4i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] @@ 
-167,6 +238,32 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: scalar_or_i32: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_mov_b32 s7, 0xf000 +; GFX6S-NEXT: s_mov_b32 s6, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_mov_b32 s4, s0 +; GFX6S-NEXT: s_or_b32 s0, s2, s3 +; GFX6S-NEXT: s_mov_b32 s5, s1 +; GFX6S-NEXT: v_mov_b32_e32 v0, s0 +; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: scalar_or_i32: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_mov_b32 s7, 0xf000 +; GFX8S-NEXT: s_mov_b32 s6, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_mov_b32 s4, s0 +; GFX8S-NEXT: s_or_b32 s0, s2, s3 +; GFX8S-NEXT: s_mov_b32 s5, s1 +; GFX8S-NEXT: v_mov_b32_e32 v0, s0 +; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: scalar_or_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -221,6 +318,34 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: vector_or_i32: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_load_dword s4, s[4:5], 0xd +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s4, s5, s4 +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: vector_or_i32: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_load_dword s4, s[4:5], 0x34 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s4, s5, s4 +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: vector_or_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -268,6 +393,30 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: scalar_or_literal_i32: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s4, s6, 0x1869f +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: scalar_or_literal_i32: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s4, s6, 0x1869f +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: scalar_or_literal_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -312,6 +461,34 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; GFX8-NEXT: buffer_store_dwordx2 
v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: scalar_or_literal_i64: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s4, s7, 0xf237b +; GFX6S-NEXT: s_or_b32 s5, s6, 0x3039 +; GFX6S-NEXT: v_mov_b32_e32 v0, s5 +; GFX6S-NEXT: v_mov_b32_e32 v1, s4 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: scalar_or_literal_i64: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s4, s7, 0xf237b +; GFX8S-NEXT: s_or_b32 s5, s6, 0x3039 +; GFX8S-NEXT: v_mov_b32_e32 v0, s5 +; GFX8S-NEXT: v_mov_b32_e32 v1, s4 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: scalar_or_literal_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -375,6 +552,51 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: scalar_or_literal_multi_use_i64: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d +; GFX6S-NEXT: s_movk_i32 s8, 0x3039 +; GFX6S-NEXT: s_mov_b32 s9, 0xf237b +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX6S-NEXT: v_mov_b32_e32 v0, s6 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: v_mov_b32_e32 v1, s7 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_add_u32 s0, s4, 0x3039 +; GFX6S-NEXT: s_addc_u32 s1, s5, 0xf237b +; GFX6S-NEXT: s_waitcnt expcnt(0) +; GFX6S-NEXT: v_mov_b32_e32 v0, s0 +; GFX6S-NEXT: v_mov_b32_e32 v1, s1 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_waitcnt vmcnt(0) +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: scalar_or_literal_multi_use_i64: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74 +; GFX8S-NEXT: s_movk_i32 s8, 0x3039 +; GFX8S-NEXT: s_mov_b32 s9, 0xf237b +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX8S-NEXT: v_mov_b32_e32 v0, s6 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: v_mov_b32_e32 v1, s7 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_add_u32 s0, s4, 0x3039 +; GFX8S-NEXT: s_addc_u32 s1, s5, 0xf237b +; GFX8S-NEXT: v_mov_b32_e32 v0, s0 +; GFX8S-NEXT: v_mov_b32_e32 v1, s1 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_waitcnt vmcnt(0) +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: scalar_or_literal_multi_use_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[] @@ -432,6 +654,32 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: scalar_or_inline_imm_i64: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; 
GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s4, s6, 63 +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: v_mov_b32_e32 v1, s7 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: scalar_or_inline_imm_i64: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s4, s6, 63 +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: v_mov_b32_e32 v1, s7 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: scalar_or_inline_imm_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -492,6 +740,49 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: scalar_or_inline_imm_multi_use_i64: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6S-NEXT: s_mov_b32 s7, 0xf000 +; GFX6S-NEXT: s_mov_b32 s6, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_mov_b32 s4, s0 +; GFX6S-NEXT: s_or_b32 s0, s2, 63 +; GFX6S-NEXT: s_mov_b32 s5, s1 +; GFX6S-NEXT: v_mov_b32_e32 v0, s0 +; GFX6S-NEXT: v_mov_b32_e32 v1, s3 +; GFX6S-NEXT: s_add_u32 s0, s8, 63 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6S-NEXT: s_addc_u32 s1, s9, 0 +; GFX6S-NEXT: s_waitcnt expcnt(0) +; GFX6S-NEXT: v_mov_b32_e32 v0, s0 +; GFX6S-NEXT: v_mov_b32_e32 v1, s1 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6S-NEXT: s_waitcnt vmcnt(0) +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: scalar_or_inline_imm_multi_use_i64: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX8S-NEXT: s_mov_b32 s7, 0xf000 +; GFX8S-NEXT: s_mov_b32 s6, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_mov_b32 s4, s0 +; GFX8S-NEXT: s_or_b32 s0, s2, 63 +; GFX8S-NEXT: s_mov_b32 s5, s1 +; GFX8S-NEXT: v_mov_b32_e32 v0, s0 +; GFX8S-NEXT: v_mov_b32_e32 v1, s3 +; GFX8S-NEXT: s_add_u32 s0, s8, 63 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8S-NEXT: s_addc_u32 s1, s9, 0 +; GFX8S-NEXT: v_mov_b32_e32 v0, s0 +; GFX8S-NEXT: v_mov_b32_e32 v1, s1 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8S-NEXT: s_waitcnt vmcnt(0) +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: scalar_or_inline_imm_multi_use_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[] @@ -545,6 +836,32 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: scalar_or_neg_inline_imm_i64: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13 +; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: v_mov_b32_e32 v1, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s4, s6, -8 +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: scalar_or_neg_inline_imm_i64: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c +; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: v_mov_b32_e32 v1, -1 +; GFX8S-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s4, s6, -8 +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: scalar_or_neg_inline_imm_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -599,6 +916,32 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: vector_or_literal_i32: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s4, s4, 0xffff +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: vector_or_literal_i32: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s4, s4, 0xffff +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: vector_or_literal_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -658,6 +1001,32 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: vector_or_inline_immediate_i32: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s4, s4, 4 +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: vector_or_inline_immediate_i32: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s4, s4, 4 +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: vector_or_inline_immediate_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -711,6 +1080,36 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: scalar_or_i64: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6S-NEXT: s_mov_b32 s7, 0xf000 +; GFX6S-NEXT: s_mov_b32 s6, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_mov_b32 s4, s0 +; GFX6S-NEXT: s_mov_b32 s5, s1 +; GFX6S-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX6S-NEXT: v_mov_b32_e32 v0, s0 +; GFX6S-NEXT: v_mov_b32_e32 v1, s1 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: scalar_or_i64: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX8S-NEXT: s_mov_b32 s7, 0xf000 +; GFX8S-NEXT: s_mov_b32 s6, -1 +; 
GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_mov_b32 s4, s0 +; GFX8S-NEXT: s_mov_b32 s5, s1 +; GFX8S-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX8S-NEXT: v_mov_b32_e32 v0, s0 +; GFX8S-NEXT: v_mov_b32_e32 v1, s1 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: scalar_or_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -774,6 +1173,38 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: vector_or_i64: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: v_mov_b32_e32 v1, s5 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: vector_or_i64: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: v_mov_b32_e32 v1, s5 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: vector_or_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] @@ -841,6 +1272,36 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: scalar_vector_or_i64: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: v_mov_b32_e32 v1, s5 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: scalar_vector_or_i64: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: v_mov_b32_e32 v1, s5 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: scalar_vector_or_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -903,6 +1364,36 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: vector_or_i64_loadimm: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s5, s5, 0x146f +; GFX6S-NEXT: s_or_b32 s4, s4, 0xdf77987f +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: v_mov_b32_e32 v1, s5 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: vector_or_i64_loadimm: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s5, s5, 0x146f +; GFX8S-NEXT: s_or_b32 s4, s4, 0xdf77987f +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: v_mov_b32_e32 v1, s5 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: vector_or_i64_loadimm: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -965,6 +1456,34 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: vector_or_i64_imm: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s4, s4, 8 +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: v_mov_b32_e32 v1, s5 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: vector_or_i64_imm: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s4, s4, 8 +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: v_mov_b32_e32 v1, s5 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: vector_or_i64_imm: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1026,6 +1545,34 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: vector_or_i64_neg_inline_imm: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: v_mov_b32_e32 v1, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s4, s4, -8 +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: vector_or_i64_neg_inline_imm: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: v_mov_b32_e32 v1, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s4, s4, -8 +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: vector_or_i64_neg_inline_imm: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 
@@ -1089,6 +1636,34 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: vector_or_i64_neg_literal: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: v_mov_b32_e32 v1, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s4, s4, 0xffffff38 +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: vector_or_i64_neg_literal: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: v_mov_b32_e32 v1, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s4, s4, 0xffffff38 +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: vector_or_i64_neg_literal: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1140,6 +1715,32 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: trunc_i64_or_to_i32: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13 +; GFX6S-NEXT: s_load_dword s7, s[4:5], 0x1d +; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6S-NEXT: s_mov_b32 s3, 0xf000 +; GFX6S-NEXT: s_mov_b32 s2, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_or_b32 s4, s7, s6 +; GFX6S-NEXT: v_mov_b32_e32 v0, s4 +; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: trunc_i64_or_to_i32: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c +; GFX8S-NEXT: s_load_dword s7, s[4:5], 0x74 +; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8S-NEXT: s_mov_b32 s3, 0xf000 +; GFX8S-NEXT: s_mov_b32 s2, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_or_b32 s4, s7, s6 +; GFX8S-NEXT: v_mov_b32_e32 v0, s4 +; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: trunc_i64_or_to_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -1211,6 +1812,46 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: or_i1: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6S-NEXT: s_mov_b32 s7, 0xf000 +; GFX6S-NEXT: s_mov_b32 s6, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_load_dword s8, s[8:9], 0x0 +; GFX6S-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX6S-NEXT: s_mov_b32 s4, s0 +; GFX6S-NEXT: s_mov_b32 s5, s1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: v_mul_f32_e64 v0, 1.0, s8 +; GFX6S-NEXT: v_mul_f32_e64 v1, 1.0, s2 +; GFX6S-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX6S-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 +; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: or_i1: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8S-NEXT: s_mov_b32 s7, 0xf000 +; GFX8S-NEXT: 
s_mov_b32 s6, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_load_dword s8, s[8:9], 0x0 +; GFX8S-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8S-NEXT: s_mov_b32 s4, s0 +; GFX8S-NEXT: s_mov_b32 s5, s1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: v_mul_f32_e64 v0, 1.0, s8 +; GFX8S-NEXT: v_mul_f32_e64 v1, 1.0, s2 +; GFX8S-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX8S-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 +; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: or_i1: ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] @@ -1274,6 +1915,38 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; +; GFX6S-LABEL: s_or_i1: +; GFX6S: ; %bb.0: +; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6S-NEXT: s_mov_b32 s7, 0xf000 +; GFX6S-NEXT: s_mov_b32 s6, -1 +; GFX6S-NEXT: s_waitcnt lgkmcnt(0) +; GFX6S-NEXT: s_cmp_eq_u32 s0, s1 +; GFX6S-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6S-NEXT: s_cmp_eq_u32 s2, s3 +; GFX6S-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6S-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX6S-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX6S-NEXT: s_endpgm +; +; GFX8S-LABEL: s_or_i1: +; GFX8S: ; %bb.0: +; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8S-NEXT: s_mov_b32 s7, 0xf000 +; GFX8S-NEXT: s_mov_b32 s6, -1 +; GFX8S-NEXT: s_waitcnt lgkmcnt(0) +; GFX8S-NEXT: s_cmp_eq_u32 s0, s1 +; GFX8S-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8S-NEXT: s_cmp_eq_u32 s2, s3 +; GFX8S-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8S-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8S-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX8S-NEXT: s_endpgm +; ; EG-LABEL: s_or_i1: ; EG: ; %bb.0: ; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[] diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index d6e361d6e297e..7322e2f239ee8 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -228,6 +228,134 @@ entry: ret void } +define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i32> %y) { +; R600-LABEL: rotr_v8i32: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[5].X, KC0[5].X, KC0[7].X, +; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[4].W, KC0[4].W, KC0[6].W, +; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[4].Z, KC0[4].Z, KC0[6].Z, +; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[4].Y, KC0[4].Y, KC0[6].Y, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T2.W, KC0[6].X, KC0[6].X, KC0[8].X, +; R600-NEXT: BIT_ALIGN_INT * T2.Z, KC0[5].W, KC0[5].W, KC0[7].W, +; R600-NEXT: BIT_ALIGN_INT * T2.Y, KC0[5].Z, KC0[5].Z, KC0[7].Z, +; R600-NEXT: BIT_ALIGN_INT * T2.X, KC0[5].Y, KC0[5].Y, KC0[7].Y, +; R600-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; R600-NEXT: LSHR * T3.X, PV.W, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; SI-LABEL: rotr_v8i32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 
0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s19 +; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 +; SI-NEXT: v_mov_b32_e32 v0, s17 +; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; SI-NEXT: v_mov_b32_e32 v4, s23 +; SI-NEXT: v_alignbit_b32 v7, s15, s15, v4 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: v_alignbit_b32 v6, s14, s14, v4 +; SI-NEXT: v_mov_b32_e32 v4, s21 +; SI-NEXT: v_alignbit_b32 v5, s13, s13, v4 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_alignbit_b32 v4, s12, s12, v4 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; GFX8-LABEL: rotr_v8i32: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mov_b32_e32 v4, s17 +; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1 +; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s23 +; GFX8-NEXT: v_alignbit_b32 v7, s15, s15, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 +; GFX8-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NEXT: v_alignbit_b32 v6, s14, s14, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s21 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_alignbit_b32 v5, s13, s13, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s19 +; GFX8-NEXT: v_alignbit_b32 v4, s12, s12, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX10-LABEL: rotr_v8i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v7, s15, s15, s23 +; GFX10-NEXT: v_alignbit_b32 v6, s14, s14, s22 +; GFX10-NEXT: v_alignbit_b32 v5, s13, s13, s21 +; GFX10-NEXT: v_alignbit_b32 v4, s12, s12, s20 +; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s19 +; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s18 +; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s17 +; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s16 +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: rotr_v8i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v7, s15, s15, s23 +; GFX11-NEXT: v_alignbit_b32 v6, s14, s14, s22 +; GFX11-NEXT: v_alignbit_b32 v5, s13, s13, s21 +; GFX11-NEXT: v_alignbit_b32 v4, s12, s12, s20 +; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s19 +; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s18 +; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s17 +; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s16 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: 
global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm +entry: + %tmp0 = sub <8 x i32> , %y + %tmp1 = shl <8 x i32> %x, %tmp0 + %tmp2 = lshr <8 x i32> %x, %y + %tmp3 = or <8 x i32> %tmp1, %tmp2 + store <8 x i32> %tmp3, ptr addrspace(1) %in + ret void +} + declare i16 @llvm.fshr.i16(i16, i16, i16) define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr addrspace(1) nocapture readonly %sourceB, ptr addrspace(1) nocapture %destValues) { diff --git a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll index d496634ae474f..8af4a8de7b266 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll @@ -18,11 +18,11 @@ define <2 x i32> @test_add2x32(ptr %a_ptr, ptr %b_ptr) { ; CHECK-LABEL: test_add2x32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dword v4, v[2:3] -; CHECK-NEXT: flat_load_dword v5, v[0:1] -; CHECK-NEXT: v_mov_b32_e32 v1, 48 +; CHECK-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v0, v5, v4 +; CHECK-NEXT: v_or_b32_e32 v1, v5, v7 +; CHECK-NEXT: v_or_b32_e32 v0, v4, v6 ; CHECK-NEXT: s_setpc_b64 s[30:31] %a = load <2 x i32>, ptr %a_ptr, !range !2, !noundef !{} %b = load <2 x i32>, ptr %b_ptr, !range !3, !noundef !{} diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index 00bb7b24786f5..3808c73ae7de3 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SIS %s +; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VIS %s + +;; Added tests with "-amdgpu-scalarize-global-loads=true" to allow the generation of s_xor_b64, particularly in the v2i32 case. See +;; SWDEV-517886. +;; Also removed the previously unused "GCN" check-prefixes from the test. 
define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v2i32: @@ -21,8 +27,8 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_xor_b32_e32 v1, v3, v1 ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 +; SI-NEXT: v_xor_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -40,10 +46,43 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_xor_b32_e32 v1, v1, v3 ; VI-NEXT: v_xor_b32_e32 v0, v0, v2 +; VI-NEXT: v_xor_b32_e32 v1, v1, v3 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm +; +; SIS-LABEL: xor_v2i32: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5] +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: v_mov_b32_e32 v1, s5 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: xor_v2i32: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] +; VIS-NEXT: v_mov_b32_e32 v3, s1 +; VIS-NEXT: v_mov_b32_e32 v2, s0 +; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VIS-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(1) %in0 %b = load <2 x i32>, ptr addrspace(1) %in1 %result = xor <2 x i32> %a, %b @@ -97,6 +136,48 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: v_xor_b32_e32 v0, v0, v4 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm +; +; SIS-LABEL: xor_v4i32: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; SIS-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_xor_b32 s7, s7, s11 +; SIS-NEXT: s_xor_b32 s6, s6, s10 +; SIS-NEXT: s_xor_b32 s5, s5, s9 +; SIS-NEXT: s_xor_b32 s4, s4, s8 +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: v_mov_b32_e32 v1, s5 +; SIS-NEXT: v_mov_b32_e32 v2, s6 +; SIS-NEXT: v_mov_b32_e32 v3, s7 +; SIS-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: xor_v4i32: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; VIS-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; VIS-NEXT: v_mov_b32_e32 v4, s0 +; VIS-NEXT: v_mov_b32_e32 v5, s1 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b32 s0, s7, s11 +; VIS-NEXT: s_xor_b32 s1, s6, s10 +; VIS-NEXT: s_xor_b32 s2, s5, s9 +; VIS-NEXT: s_xor_b32 s3, s4, s8 +; 
VIS-NEXT: v_mov_b32_e32 v0, s3 +; VIS-NEXT: v_mov_b32_e32 v1, s2 +; VIS-NEXT: v_mov_b32_e32 v2, s1 +; VIS-NEXT: v_mov_b32_e32 v3, s0 +; VIS-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VIS-NEXT: s_endpgm %a = load <4 x i32>, ptr addrspace(1) %in0 %b = load <4 x i32>, ptr addrspace(1) %in1 %result = xor <4 x i32> %a, %b @@ -152,6 +233,47 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; SIS-LABEL: xor_i1: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SIS-NEXT: s_mov_b32 s7, 0xf000 +; SIS-NEXT: s_mov_b32 s6, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_load_dword s8, s[2:3], 0x0 +; SIS-NEXT: s_load_dword s9, s[4:5], 0x0 +; SIS-NEXT: s_mov_b32 s4, s0 +; SIS-NEXT: s_mov_b32 s5, s1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: v_cmp_ge_f32_e64 s[0:1], s8, 0 +; SIS-NEXT: v_cmp_ge_f32_e64 s[2:3], s9, 1.0 +; SIS-NEXT: v_mov_b32_e32 v0, s9 +; SIS-NEXT: v_mov_b32_e32 v1, s8 +; SIS-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; SIS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: xor_i1: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_load_dword s6, s[2:3], 0x0 +; VIS-NEXT: s_load_dword s4, s[4:5], 0x0 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: v_cmp_ge_f32_e64 s[0:1], s6, 0 +; VIS-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, 1.0 +; VIS-NEXT: v_mov_b32_e32 v2, s4 +; VIS-NEXT: v_mov_b32_e32 v3, s6 +; VIS-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; VIS-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VIS-NEXT: flat_store_dword v[0:1], v2 +; VIS-NEXT: s_endpgm + %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -206,6 +328,50 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0 ; VI-NEXT: v_and_b32_e32 v2, 1, v2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm +; +; SIS-LABEL: v_xor_i1: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SIS-NEXT: s_mov_b32 s7, 0xf000 +; SIS-NEXT: s_mov_b32 s6, -1 +; SIS-NEXT: s_mov_b32 s14, s6 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_mov_b32 s12, s2 +; SIS-NEXT: s_mov_b32 s13, s3 +; SIS-NEXT: s_mov_b32 s15, s7 +; SIS-NEXT: s_mov_b32 s10, s6 +; SIS-NEXT: s_mov_b32 s11, s7 +; SIS-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc +; SIS-NEXT: s_waitcnt vmcnt(0) +; SIS-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc +; SIS-NEXT: s_waitcnt vmcnt(0) +; SIS-NEXT: s_mov_b32 s4, s0 +; SIS-NEXT: s_mov_b32 s5, s1 +; SIS-NEXT: v_xor_b32_e32 v0, v0, v1 +; SIS-NEXT: v_and_b32_e32 v0, 1, v0 +; SIS-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: v_xor_i1: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: v_mov_b32_e32 v0, s2 +; VIS-NEXT: v_mov_b32_e32 v1, s3 +; VIS-NEXT: v_mov_b32_e32 v2, s4 +; VIS-NEXT: v_mov_b32_e32 v3, s5 +; VIS-NEXT: flat_load_ubyte v4, v[0:1] glc +; VIS-NEXT: s_waitcnt vmcnt(0) +; VIS-NEXT: flat_load_ubyte v2, v[2:3] glc +; VIS-NEXT: s_waitcnt vmcnt(0) +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; 
VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: v_xor_b32_e32 v2, v4, v2 +; VIS-NEXT: v_and_b32_e32 v2, 1, v2 +; VIS-NEXT: flat_store_byte v[0:1], v2 +; VIS-NEXT: s_endpgm %a = load volatile i1, ptr addrspace(1) %in0 %b = load volatile i1, ptr addrspace(1) %in1 %xor = xor i1 %a, %b @@ -253,6 +419,36 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_xor_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; SIS-LABEL: vector_xor_i32: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_load_dword s6, s[2:3], 0x0 +; SIS-NEXT: s_load_dword s4, s[4:5], 0x0 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_xor_b32 s4, s6, s4 +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: vector_xor_i32: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_load_dword s2, s[2:3], 0x0 +; VIS-NEXT: s_load_dword s3, s[4:5], 0x0 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b32 s0, s2, s3 +; VIS-NEXT: v_mov_b32_e32 v2, s0 +; VIS-NEXT: flat_store_dword v[0:1], v2 +; VIS-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in0 %b = load i32, ptr addrspace(1) %in1 %result = xor i32 %a, %b @@ -284,6 +480,30 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; SIS-LABEL: scalar_xor_i32: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_mov_b32 s7, 0xf000 +; SIS-NEXT: s_mov_b32 s6, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_mov_b32 s4, s0 +; SIS-NEXT: s_xor_b32 s0, s2, s3 +; SIS-NEXT: s_mov_b32 s5, s1 +; SIS-NEXT: v_mov_b32_e32 v0, s0 +; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: scalar_xor_i32: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b32 s2, s2, s3 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: v_mov_b32_e32 v2, s2 +; VIS-NEXT: flat_store_dword v[0:1], v2 +; VIS-NEXT: s_endpgm %result = xor i32 %a, %b store i32 %result, ptr addrspace(1) %out ret void @@ -313,6 +533,30 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; SIS-LABEL: scalar_not_i32: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dword s6, s[4:5], 0xb +; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_not_b32 s4, s6 +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: scalar_not_i32: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dword s2, s[4:5], 0x2c +; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_not_b32 s2, s2 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: v_mov_b32_e32 v2, s2 +; VIS-NEXT: flat_store_dword v[0:1], v2 +; VIS-NEXT: s_endpgm %result = xor i32 %a, -1 store i32 %result, ptr 
addrspace(1) %out ret void @@ -350,6 +594,32 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_not_b32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; SIS-LABEL: vector_not_i32: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_load_dword s4, s[2:3], 0x0 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_not_b32 s4, s4 +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: vector_not_i32: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_load_dword s2, s[2:3], 0x0 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_not_b32 s0, s2 +; VIS-NEXT: v_mov_b32_e32 v2, s0 +; VIS-NEXT: flat_store_dword v[0:1], v2 +; VIS-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in0 %b = load i32, ptr addrspace(1) %in1 %result = xor i32 %a, -1 @@ -399,6 +669,38 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_xor_b32_e32 v1, v1, v3 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm +; +; SIS-LABEL: vector_xor_i64: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5] +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: v_mov_b32_e32 v1, s5 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: vector_xor_i64: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] +; VIS-NEXT: v_mov_b32_e32 v3, s1 +; VIS-NEXT: v_mov_b32_e32 v2, s0 +; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VIS-NEXT: s_endpgm %a = load i64, ptr addrspace(1) %in0 %b = load i64, ptr addrspace(1) %in1 %result = xor i64 %a, %b @@ -434,6 +736,34 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; SIS-LABEL: scalar_xor_i64: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SIS-NEXT: s_mov_b32 s7, 0xf000 +; SIS-NEXT: s_mov_b32 s6, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_mov_b32 s4, s0 +; SIS-NEXT: s_mov_b32 s5, s1 +; SIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] +; SIS-NEXT: v_mov_b32_e32 v0, s0 +; SIS-NEXT: v_mov_b32_e32 v1, s1 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: scalar_xor_i64: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], 
s[4:5] +; VIS-NEXT: v_mov_b32_e32 v3, s1 +; VIS-NEXT: v_mov_b32_e32 v2, s0 +; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VIS-NEXT: s_endpgm %result = xor i64 %a, %b store i64 %result, ptr addrspace(1) %out ret void @@ -465,6 +795,32 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; SIS-LABEL: scalar_not_i64: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_mov_b32 s7, 0xf000 +; SIS-NEXT: s_mov_b32 s6, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_mov_b32 s4, s0 +; SIS-NEXT: s_mov_b32 s5, s1 +; SIS-NEXT: s_not_b64 s[0:1], s[2:3] +; SIS-NEXT: v_mov_b32_e32 v0, s0 +; SIS-NEXT: v_mov_b32_e32 v1, s1 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: scalar_not_i64: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: s_not_b64 s[0:1], s[2:3] +; VIS-NEXT: v_mov_b32_e32 v3, s1 +; VIS-NEXT: v_mov_b32_e32 v2, s0 +; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VIS-NEXT: s_endpgm %result = xor i64 %a, -1 store i64 %result, ptr addrspace(1) %out ret void @@ -504,6 +860,34 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_not_b32_e32 v1, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; SIS-LABEL: vector_not_i64: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_not_b64 s[4:5], s[4:5] +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: v_mov_b32_e32 v1, s5 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: vector_not_i64: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_not_b64 s[0:1], s[2:3] +; VIS-NEXT: v_mov_b32_e32 v3, s1 +; VIS-NEXT: v_mov_b32_e32 v2, s0 +; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VIS-NEXT: s_endpgm %a = load i64, ptr addrspace(1) %in0 %b = load i64, ptr addrspace(1) %in1 %result = xor i64 %a, -1 @@ -570,6 +954,59 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; VI-NEXT: .LBB12_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_branch .LBB12_2 +; +; SIS-LABEL: xor_cf: +; SIS: ; %bb.0: ; %entry +; SIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SIS-NEXT: s_mov_b64 s[10:11], 0 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 +; SIS-NEXT: s_and_b64 vcc, exec, s[8:9] +; SIS-NEXT: s_cbranch_vccz .LBB12_4 +; SIS-NEXT: ; %bb.1: ; %else +; SIS-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; SIS-NEXT: s_andn2_b64 vcc, exec, s[10:11] +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_mov_b64 vcc, vcc +; SIS-NEXT: s_cbranch_vccnz .LBB12_3 +; SIS-NEXT: .LBB12_2: ; %if +; SIS-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] +; SIS-NEXT: .LBB12_3: ; %endif +; SIS-NEXT: v_mov_b32_e32 v0, s8 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: v_mov_b32_e32 v1, s9 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SIS-NEXT: s_endpgm 
+; SIS-NEXT: .LBB12_4: +; SIS-NEXT: ; implicit-def: $sgpr8_sgpr9 +; SIS-NEXT: s_branch .LBB12_2 +; +; VIS-LABEL: xor_cf: +; VIS: ; %bb.0: ; %entry +; VIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VIS-NEXT: s_mov_b64 s[8:9], 0 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_cmp_lg_u64 s[4:5], 0 +; VIS-NEXT: s_cbranch_scc0 .LBB12_4 +; VIS-NEXT: ; %bb.1: ; %else +; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VIS-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; VIS-NEXT: s_cbranch_vccnz .LBB12_3 +; VIS-NEXT: .LBB12_2: ; %if +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7] +; VIS-NEXT: .LBB12_3: ; %endif +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: v_mov_b32_e32 v2, s2 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: v_mov_b32_e32 v3, s3 +; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VIS-NEXT: s_endpgm +; VIS-NEXT: .LBB12_4: +; VIS-NEXT: ; implicit-def: $sgpr2_sgpr3 +; VIS-NEXT: s_branch .LBB12_2 entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else @@ -616,6 +1053,34 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; SIS-LABEL: scalar_xor_literal_i64: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_xor_b32 s4, s7, 0xf237b +; SIS-NEXT: s_xor_b32 s5, s6, 0x3039 +; SIS-NEXT: v_mov_b32_e32 v0, s5 +; SIS-NEXT: v_mov_b32_e32 v1, s4 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: scalar_xor_literal_i64: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b32 s1, s1, 0xf237b +; VIS-NEXT: s_xor_b32 s0, s0, 0x3039 +; VIS-NEXT: v_mov_b32_e32 v2, s2 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: v_mov_b32_e32 v3, s3 +; VIS-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VIS-NEXT: s_endpgm %or = xor i64 %a, 4261135838621753 store i64 %or, ptr addrspace(1) %out ret void @@ -664,6 +1129,49 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm +; +; SIS-LABEL: scalar_xor_literal_multi_use_i64: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SIS-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x13 +; SIS-NEXT: s_movk_i32 s8, 0x3039 +; SIS-NEXT: s_mov_b32 s9, 0xf237b +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: v_mov_b32_e32 v1, s5 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SIS-NEXT: s_add_u32 s0, s6, 0x3039 +; SIS-NEXT: s_addc_u32 s1, s7, 0xf237b +; SIS-NEXT: s_waitcnt expcnt(0) +; SIS-NEXT: v_mov_b32_e32 v0, s0 +; SIS-NEXT: v_mov_b32_e32 v1, s1 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SIS-NEXT: s_waitcnt vmcnt(0) +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: scalar_xor_literal_multi_use_i64: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4c +; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VIS-NEXT: s_movk_i32 s6, 0x3039 +; VIS-NEXT: s_mov_b32 s7, 0xf237b +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b64 
s[0:1], s[0:1], s[6:7] +; VIS-NEXT: v_mov_b32_e32 v0, s4 +; VIS-NEXT: v_mov_b32_e32 v3, s1 +; VIS-NEXT: v_mov_b32_e32 v1, s5 +; VIS-NEXT: v_mov_b32_e32 v2, s0 +; VIS-NEXT: s_add_u32 s0, s2, 0x3039 +; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VIS-NEXT: s_addc_u32 s1, s3, 0xf237b +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; VIS-NEXT: s_waitcnt vmcnt(0) +; VIS-NEXT: s_endpgm %or = xor i64 %a, 4261135838621753 store i64 %or, ptr addrspace(1) %out @@ -698,6 +1206,32 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; SIS-LABEL: scalar_xor_inline_imm_i64: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_xor_b32 s4, s6, 63 +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: v_mov_b32_e32 v1, s7 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: scalar_xor_inline_imm_i64: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b32 s0, s0, 63 +; VIS-NEXT: v_mov_b32_e32 v2, s2 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v3, s3 +; VIS-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VIS-NEXT: s_endpgm %or = xor i64 %a, 63 store i64 %or, ptr addrspace(1) %out ret void @@ -729,6 +1263,33 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; SIS-LABEL: scalar_xor_neg_inline_imm_i64: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], -8 +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: v_mov_b32_e32 v1, s5 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: scalar_xor_neg_inline_imm_i64: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b64 s[0:1], s[0:1], -8 +; VIS-NEXT: v_mov_b32_e32 v0, s2 +; VIS-NEXT: v_mov_b32_e32 v3, s1 +; VIS-NEXT: v_mov_b32_e32 v1, s3 +; VIS-NEXT: v_mov_b32_e32 v2, s0 +; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VIS-NEXT: s_endpgm + %or = xor i64 %a, -8 store i64 %or, ptr addrspace(1) %out ret void @@ -768,6 +1329,34 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ; VI-NEXT: v_xor_b32_e32 v1, -1, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; SIS-LABEL: vector_xor_i64_neg_inline_imm: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_xor_b64 s[4:5], s[4:5], -8 +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: v_mov_b32_e32 v1, s5 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: vector_xor_i64_neg_inline_imm: 
+; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], -8 +; VIS-NEXT: v_mov_b32_e32 v3, s1 +; VIS-NEXT: v_mov_b32_e32 v2, s0 +; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VIS-NEXT: s_endpgm %loada = load i64, ptr addrspace(1) %a, align 8 %or = xor i64 %loada, -8 store i64 %or, ptr addrspace(1) %out @@ -808,10 +1397,39 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; SIS-LABEL: vector_xor_literal_i64: +; SIS: ; %bb.0: +; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SIS-NEXT: s_mov_b32 s3, 0xf000 +; SIS-NEXT: s_mov_b32 s2, -1 +; SIS-NEXT: s_waitcnt lgkmcnt(0) +; SIS-NEXT: s_xor_b32 s5, s5, 0x146f +; SIS-NEXT: s_xor_b32 s4, s4, 0xdf77987f +; SIS-NEXT: v_mov_b32_e32 v0, s4 +; SIS-NEXT: v_mov_b32_e32 v1, s5 +; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SIS-NEXT: s_endpgm +; +; VIS-LABEL: vector_xor_literal_i64: +; VIS: ; %bb.0: +; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VIS-NEXT: v_mov_b32_e32 v0, s0 +; VIS-NEXT: v_mov_b32_e32 v1, s1 +; VIS-NEXT: s_waitcnt lgkmcnt(0) +; VIS-NEXT: s_xor_b32 s0, s3, 0x146f +; VIS-NEXT: s_xor_b32 s1, s2, 0xdf77987f +; VIS-NEXT: v_mov_b32_e32 v2, s1 +; VIS-NEXT: v_mov_b32_e32 v3, s0 +; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VIS-NEXT: s_endpgm + %loada = load i64, ptr addrspace(1) %a, align 8 %or = xor i64 %loada, 22470723082367 store i64 %or, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GCN: {{.*}} From d51a901195bb93da85b29486ce34c0da51bfb85b Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Thu, 19 Jun 2025 09:03:24 -0500 Subject: [PATCH 03/25] Remove over-enthusiastic clang-format --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 779ce2daf53fc..92baf9bacb1a0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4032,8 +4032,9 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. 
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( - DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, - uint32_t ValLo, uint32_t ValHi) const { + DAGCombinerInfo &DCI, const SDLoc &SL, + unsigned Opc, SDValue LHS, + uint32_t ValLo, uint32_t ValHi) const { SelectionDAG &DAG = DCI.DAG; SDValue Lo, Hi; std::tie(Lo, Hi) = split64BitValue(LHS, DAG); From 61fa9f73f9012547c0a72999a99f50e13c5cd9e6 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 23 Jun 2025 10:35:11 -0500 Subject: [PATCH 04/25] Respond to some review comments --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 +++ llvm/test/CodeGen/AMDGPU/or.ll | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e2162bb56ff08..7b7a637046081 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13366,6 +13366,9 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + // Fold the fneg of a vselect into the v2 vselect operands. + // xor (vselect c, a, b), 0x80000000 -> + // bitcast (vselect c, (fneg (bitcast a)), (fneg (bitcast b))) if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) { const ConstantSDNode *CRHS0 = dyn_cast(RHS.getOperand(0)); diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 26751b289a385..b55c6423a0de8 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -5,9 +5,6 @@ ; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8S %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -;; Added tests with "-amdgpu-scalarize-global-loads=true" to allow the generation of s_or_b64, particularly in the v2i32 case. See SWDEV-517886. -;; Also removed the previously unused "GCN" check-prefixes from the test. 
- define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v2i32: ; GFX6: ; %bb.0: From 675a024e123117562175ef80e1e03832511bf1ce Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 23 Jun 2025 10:48:35 -0500 Subject: [PATCH 05/25] Add reviewer requested tests --- llvm/test/CodeGen/AMDGPU/or.ll | 102 +++++++++++--------------------- llvm/test/CodeGen/AMDGPU/xor.ll | 98 ++++++++++++++---------------- 2 files changed, 76 insertions(+), 124 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index b55c6423a0de8..f4855c0056b53 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -1,9 +1,39 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6S %s -; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8S %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s + + +define amdgpu_ps <2 x i32> @s_or_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) { +; GFX6-LABEL: s_or_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_or_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: ; return to shader part epilog + %result = or <2 x i32> %num, %den + ret <2 x i32> %result +} + +define <2 x i32> @v_or_v2i32(<2 x i32> %num, <2 x i32> %den) { +; GFX6-LABEL: v_or_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_or_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = or <2 x i32> %num, %den + ret <2 x i32> %result +} define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v2i32: @@ -43,7 +73,6 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: or_v2i32: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -57,7 +86,6 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX6S-NEXT: v_mov_b32_e32 v1, s5 ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: or_v2i32: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -71,7 +99,6 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8S-NEXT: v_mov_b32_e32 v1, s5 ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: or_v2i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -141,7 +168,6 @@ 
define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: or_v4i32: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 @@ -160,7 +186,6 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX6S-NEXT: v_mov_b32_e32 v3, s3 ; GFX6S-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: or_v4i32: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 @@ -179,7 +204,6 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8S-NEXT: v_mov_b32_e32 v3, s3 ; GFX8S-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: or_v4i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] @@ -234,7 +258,6 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: scalar_or_i32: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -247,7 +270,6 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX6S-NEXT: v_mov_b32_e32 v0, s0 ; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: scalar_or_i32: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -260,7 +282,6 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX8S-NEXT: v_mov_b32_e32 v0, s0 ; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: scalar_or_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -314,7 +335,6 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: vector_or_i32: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -328,7 +348,6 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX6S-NEXT: v_mov_b32_e32 v0, s4 ; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: vector_or_i32: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -342,7 +361,6 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8S-NEXT: v_mov_b32_e32 v0, s4 ; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: vector_or_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -389,7 +407,6 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: scalar_or_literal_i32: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dword s6, s[4:5], 0xb @@ -401,7 +418,6 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) ; GFX6S-NEXT: v_mov_b32_e32 v0, s4 ; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: scalar_or_literal_i32: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x2c @@ -413,7 +429,6 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) ; GFX8S-NEXT: v_mov_b32_e32 v0, s4 ; GFX8S-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: scalar_or_literal_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -457,7 +472,6 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: scalar_or_literal_i64: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 @@ -471,7 +485,6 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; GFX6S-NEXT: v_mov_b32_e32 v1, s4 ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: scalar_or_literal_i64: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c @@ -485,7 +498,6 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; GFX8S-NEXT: v_mov_b32_e32 v1, s4 ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: scalar_or_literal_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -548,7 +560,6 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: scalar_or_literal_multi_use_i64: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -571,7 +582,6 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6S-NEXT: s_waitcnt vmcnt(0) ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: scalar_or_literal_multi_use_i64: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c @@ -593,7 +603,6 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8S-NEXT: s_waitcnt vmcnt(0) ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: scalar_or_literal_multi_use_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[] @@ -650,7 +659,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: scalar_or_inline_imm_i64: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 @@ -663,7 +671,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; GFX6S-NEXT: v_mov_b32_e32 v1, s7 ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: scalar_or_inline_imm_i64: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c @@ -676,7 +683,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; GFX8S-NEXT: v_mov_b32_e32 v1, s7 ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: scalar_or_inline_imm_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -736,7 +742,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -758,7 +763,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6S-NEXT: 
s_waitcnt vmcnt(0) ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -779,7 +783,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8S-NEXT: s_waitcnt vmcnt(0) ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: scalar_or_inline_imm_multi_use_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[] @@ -832,7 +835,6 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: scalar_or_neg_inline_imm_i64: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13 @@ -845,7 +847,6 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; GFX6S-NEXT: v_mov_b32_e32 v0, s4 ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: scalar_or_neg_inline_imm_i64: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c @@ -858,7 +859,6 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; GFX8S-NEXT: v_mov_b32_e32 v0, s4 ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: scalar_or_neg_inline_imm_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -912,7 +912,6 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: vector_or_literal_i32: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -925,7 +924,6 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; GFX6S-NEXT: v_mov_b32_e32 v0, s4 ; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: vector_or_literal_i32: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -938,7 +936,6 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; GFX8S-NEXT: v_mov_b32_e32 v0, s4 ; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: vector_or_literal_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -997,7 +994,6 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; GFX8-NEXT: v_or_b32_e32 v0, 4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: vector_or_inline_immediate_i32: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1010,7 +1006,6 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; GFX6S-NEXT: v_mov_b32_e32 v0, s4 ; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: vector_or_inline_immediate_i32: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1023,7 +1018,6 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; GFX8S-NEXT: v_mov_b32_e32 v0, s4 ; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: vector_or_inline_immediate_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1076,7 +1070,6 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 
v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: scalar_or_i64: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1091,7 +1084,6 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX6S-NEXT: v_mov_b32_e32 v1, s1 ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: scalar_or_i64: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1106,7 +1098,6 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX8S-NEXT: v_mov_b32_e32 v1, s1 ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: scalar_or_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -1169,7 +1160,6 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: vector_or_i64: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1185,7 +1175,6 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX6S-NEXT: v_mov_b32_e32 v1, s5 ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: vector_or_i64: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1201,7 +1190,6 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8S-NEXT: v_mov_b32_e32 v1, s5 ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: vector_or_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] @@ -1268,7 +1256,6 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_or_b32_e32 v1, s13, v1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: scalar_vector_or_i64: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1283,7 +1270,6 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs ; GFX6S-NEXT: v_mov_b32_e32 v1, s5 ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: scalar_vector_or_i64: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1298,7 +1284,6 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs ; GFX8S-NEXT: v_mov_b32_e32 v1, s5 ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: scalar_vector_or_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1360,7 +1345,6 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: vector_or_i64_loadimm: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1375,7 +1359,6 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; GFX6S-NEXT: v_mov_b32_e32 v1, s5 ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: vector_or_i64_loadimm: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1390,7 +1373,6 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; GFX8S-NEXT: v_mov_b32_e32 v1, s5 ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; 
GFX8S-NEXT: s_endpgm -; ; EG-LABEL: vector_or_i64_loadimm: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1452,7 +1434,6 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; GFX8-NEXT: v_or_b32_e32 v0, 8, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: vector_or_i64_imm: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1466,7 +1447,6 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; GFX6S-NEXT: v_mov_b32_e32 v1, s5 ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: vector_or_i64_imm: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1480,7 +1460,6 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; GFX8S-NEXT: v_mov_b32_e32 v1, s5 ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: vector_or_i64_imm: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1541,7 +1520,6 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; GFX8-NEXT: v_or_b32_e32 v0, -8, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: vector_or_i64_neg_inline_imm: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1555,7 +1533,6 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; GFX6S-NEXT: v_mov_b32_e32 v0, s4 ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: vector_or_i64_neg_inline_imm: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1569,7 +1546,6 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; GFX8S-NEXT: v_mov_b32_e32 v0, s4 ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: vector_or_i64_neg_inline_imm: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1632,7 +1608,6 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: vector_or_i64_neg_literal: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1646,7 +1621,6 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; GFX6S-NEXT: v_mov_b32_e32 v0, s4 ; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: vector_or_i64_neg_literal: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1660,7 +1634,6 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; GFX8S-NEXT: v_mov_b32_e32 v0, s4 ; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: vector_or_i64_neg_literal: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1711,7 +1684,6 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: trunc_i64_or_to_i32: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13 @@ -1724,7 +1696,6 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; GFX6S-NEXT: v_mov_b32_e32 v0, s4 ; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 
GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: trunc_i64_or_to_i32: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c @@ -1737,7 +1708,6 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; GFX8S-NEXT: v_mov_b32_e32 v0, s4 ; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: trunc_i64_or_to_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -1808,7 +1778,6 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: or_i1: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1828,7 +1797,6 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: or_i1: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 @@ -1848,7 +1816,6 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: or_i1: ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] @@ -1911,7 +1878,6 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; GFX6S-LABEL: s_or_i1: ; GFX6S: ; %bb.0: ; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb @@ -1927,7 +1893,6 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX6S-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX6S-NEXT: s_endpgm -; ; GFX8S-LABEL: s_or_i1: ; GFX8S: ; %bb.0: ; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -1943,7 +1908,6 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8S-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX8S-NEXT: s_endpgm -; ; EG-LABEL: s_or_i1: ; EG: ; %bb.0: ; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[] diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index 3808c73ae7de3..d7e780a5ddf74 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -1,12 +1,38 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SIS %s -; RUN: llc -amdgpu-scalarize-global-loads=true -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VIS %s -;; Added tests with "-amdgpu-scalarize-global-loads=true" to allow the generation of s_xor_b64, particularly in the v2i32 case. See -;; SWDEV-517886. -;; Also removed the previously unused "GCN" check-prefixes from the test. 
+define amdgpu_ps <2 x i32> @s_xor_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) { +; SI-LABEL: s_xor_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: s_xor_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: ; return to shader part epilog + %result = xor <2 x i32> %num, %den + ret <2 x i32> %result +} + +define <2 x i32> @v_xor_v2i32(<2 x i32> %num, <2 x i32> %den) { +; SI-LABEL: v_xor_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_xor_b32_e32 v1, v1, v3 +; SI-NEXT: v_xor_b32_e32 v0, v0, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_xor_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_xor_b32_e32 v1, v1, v3 +; VI-NEXT: v_xor_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] + %result = xor <2 x i32> %num, %den + ret <2 x i32> %result +} define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v2i32: @@ -50,7 +76,6 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: v_xor_b32_e32 v1, v1, v3 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm -; ; SIS-LABEL: xor_v2i32: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -66,7 +91,6 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; SIS-NEXT: v_mov_b32_e32 v1, s5 ; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: xor_v2i32: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -136,7 +160,6 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: v_xor_b32_e32 v0, v0, v4 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm -; ; SIS-LABEL: xor_v4i32: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -157,7 +180,6 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; SIS-NEXT: v_mov_b32_e32 v3, s7 ; SIS-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: xor_v4i32: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -233,7 +255,6 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm -; ; SIS-LABEL: xor_i1: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -254,7 +275,6 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; SIS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: xor_i1: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -328,7 +348,6 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0 ; VI-NEXT: v_and_b32_e32 v2, 1, v2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm -; ; SIS-LABEL: v_xor_i1: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -352,7 +371,6 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0 ; SIS-NEXT: v_and_b32_e32 v0, 1, v0 ; SIS-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: v_xor_i1: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -419,7 +437,6 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: 
v_xor_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm -; ; SIS-LABEL: vector_xor_i32: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -434,7 +451,6 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1 ; SIS-NEXT: v_mov_b32_e32 v0, s4 ; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: vector_xor_i32: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -480,7 +496,6 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm -; ; SIS-LABEL: scalar_xor_i32: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -493,7 +508,6 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; SIS-NEXT: v_mov_b32_e32 v0, s0 ; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: scalar_xor_i32: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -533,7 +547,6 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm -; ; SIS-LABEL: scalar_not_i32: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dword s6, s[4:5], 0xb @@ -545,7 +558,6 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; SIS-NEXT: v_mov_b32_e32 v0, s4 ; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: scalar_not_i32: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dword s2, s[4:5], 0x2c @@ -594,7 +606,6 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_not_b32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm -; ; SIS-LABEL: vector_not_i32: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -607,7 +618,6 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 ; SIS-NEXT: v_mov_b32_e32 v0, s4 ; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: vector_not_i32: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -669,7 +679,6 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_xor_b32_e32 v1, v1, v3 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm -; ; SIS-LABEL: vector_xor_i64: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -685,7 +694,6 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 ; SIS-NEXT: v_mov_b32_e32 v1, s5 ; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: vector_xor_i64: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -736,7 +744,6 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm -; ; SIS-LABEL: scalar_xor_i64: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -751,7 +758,6 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; SIS-NEXT: v_mov_b32_e32 v1, s1 ; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: scalar_xor_i64: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -795,7 +801,6 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; VI-NEXT: v_mov_b32_e32 v2, s0 ; 
VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm -; ; SIS-LABEL: scalar_not_i64: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -809,7 +814,6 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; SIS-NEXT: v_mov_b32_e32 v1, s1 ; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: scalar_not_i64: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -860,7 +864,6 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_not_b32_e32 v1, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; ; SIS-LABEL: vector_not_i64: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -874,7 +877,6 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 ; SIS-NEXT: v_mov_b32_e32 v1, s5 ; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: vector_not_i64: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -903,7 +905,7 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 ; SI-NEXT: s_and_b64 vcc, exec, s[10:11] -; SI-NEXT: s_cbranch_vccz .LBB12_4 +; SI-NEXT: s_cbranch_vccz .LBB14_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s15, 0xf000 ; SI-NEXT: s_mov_b32 s14, -1 @@ -911,21 +913,21 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 ; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; SI-NEXT: s_cbranch_vccnz .LBB12_3 -; SI-NEXT: .LBB12_2: ; %if +; SI-NEXT: s_cbranch_vccnz .LBB14_3 +; SI-NEXT: .LBB14_2: ; %if ; SI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: .LBB12_3: ; %endif +; SI-NEXT: .LBB14_3: ; %endif ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB12_4: +; SI-NEXT: .LBB14_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB12_2 +; SI-NEXT: s_branch .LBB14_2 ; ; VI-LABEL: xor_cf: ; VI: ; %bb.0: ; %entry @@ -933,28 +935,27 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 -; VI-NEXT: s_cbranch_scc0 .LBB12_4 +; VI-NEXT: s_cbranch_scc0 .LBB14_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; VI-NEXT: s_cbranch_vccnz .LBB12_3 -; VI-NEXT: .LBB12_2: ; %if +; VI-NEXT: s_cbranch_vccnz .LBB14_3 +; VI-NEXT: .LBB14_2: ; %if ; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: .LBB12_3: ; %endif +; VI-NEXT: .LBB14_3: ; %endif ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; VI-NEXT: .LBB12_4: +; VI-NEXT: .LBB14_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; VI-NEXT: s_branch .LBB12_2 -; +; VI-NEXT: s_branch .LBB14_2 ; SIS-LABEL: xor_cf: ; SIS: ; %bb.0: ; %entry ; SIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 @@ -981,7 +982,6 @@ 
define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; SIS-NEXT: .LBB12_4: ; SIS-NEXT: ; implicit-def: $sgpr8_sgpr9 ; SIS-NEXT: s_branch .LBB12_2 -; ; VIS-LABEL: xor_cf: ; VIS: ; %bb.0: ; %entry ; VIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 @@ -1053,7 +1053,6 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; ; SIS-LABEL: scalar_xor_literal_i64: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 @@ -1067,7 +1066,6 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 ; SIS-NEXT: v_mov_b32_e32 v1, s4 ; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: scalar_xor_literal_i64: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c @@ -1129,7 +1127,6 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm -; ; SIS-LABEL: scalar_xor_literal_multi_use_i64: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 @@ -1151,7 +1148,6 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou ; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SIS-NEXT: s_waitcnt vmcnt(0) ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: scalar_xor_literal_multi_use_i64: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4c @@ -1206,7 +1202,6 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; ; SIS-LABEL: scalar_xor_inline_imm_i64: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 @@ -1219,7 +1214,6 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x ; SIS-NEXT: v_mov_b32_e32 v1, s7 ; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: scalar_xor_inline_imm_i64: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c @@ -1263,7 +1257,6 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm -; ; SIS-LABEL: scalar_xor_neg_inline_imm_i64: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 @@ -1276,7 +1269,6 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; SIS-NEXT: v_mov_b32_e32 v1, s5 ; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: scalar_xor_neg_inline_imm_i64: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c @@ -1329,7 +1321,6 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ; VI-NEXT: v_xor_b32_e32 v1, -1, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; ; SIS-LABEL: vector_xor_i64_neg_inline_imm: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1343,7 +1334,6 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ; SIS-NEXT: v_mov_b32_e32 v1, s5 ; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: vector_xor_i64_neg_inline_imm: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1397,7 +1387,6 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_xor_b32_e32 
v0, 0xdf77987f, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; ; SIS-LABEL: vector_xor_literal_i64: ; SIS: ; %bb.0: ; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 @@ -1412,7 +1401,6 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add ; SIS-NEXT: v_mov_b32_e32 v1, s5 ; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SIS-NEXT: s_endpgm -; ; VIS-LABEL: vector_xor_literal_i64: ; VIS: ; %bb.0: ; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 From c0092d0ab634438f0e9620b8f1816705dbb84468 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 23 Jun 2025 10:54:16 -0500 Subject: [PATCH 06/25] Suppress over-enthusiastic clang-format --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 ++-- llvm/test/CodeGen/AMDGPU/or.ll | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 92baf9bacb1a0..9a3a326bd3588 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4944,8 +4944,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, if (Inv) std::swap(NewLHS, NewRHS); - SDValue NewSelect = - DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS); + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, + Cond, NewLHS, NewRHS); DCI.AddToWorklist(NewSelect.getNode()); return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect); } diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index f4855c0056b53..b1d9c665ebf08 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s - +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s define amdgpu_ps <2 x i32> @s_or_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) { ; GFX6-LABEL: s_or_v2i32: From f58cac592b0434b069c7911f6b247d2eadf822bc Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 23 Jun 2025 13:01:29 -0500 Subject: [PATCH 07/25] Temporarily remove r600 from or.ll test --- llvm/test/CodeGen/AMDGPU/or.ll | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index b1d9c665ebf08..0a71a644652fe 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s define amdgpu_ps <2 x i32> @s_or_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) { ; GFX6-LABEL: s_or_v2i32: From 7ce16e8757cac52925e7648555001195d4612542 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 24 Jun 2025 06:11:18 -0500 
Subject: [PATCH 08/25] Add SGPR and VGPR tests to and.ll and temporarily remove the r600 run line. --- llvm/test/CodeGen/AMDGPU/and.ll | 79 ++++++++++++++------------------- 1 file changed, 34 insertions(+), 45 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index e5fe9195e2dcc..4673df3183cfa 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -1,10 +1,41 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s -; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s declare i32 @llvm.amdgcn.workitem.id.x() #0 +define amdgpu_ps <2 x i32> @s_and_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) { +; GFX6-LABEL: s_and_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_and_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: ; return to shader part epilog + %result = and <2 x i32> %num, %den + ret <2 x i32> %result +} + +define <2 x i32> @v_and_v2i32(<2 x i32> %num, <2 x i32> %den) { +; GFX6-LABEL: v_and_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_and_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = and <2 x i32> %num, %den + ret <2 x i32> %result +} + define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test2: ; GFX6: ; %bb.0: @@ -14,8 +45,7 @@ define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s5, s7 -; GFX6-NEXT: s_and_b32 s4, s4, s6 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -29,13 +59,11 @@ define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s5, s7 -; GFX8-NEXT: s_and_b32 s4, s4, s6 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: test2: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -99,7 +127,6 @@ define amdgpu_kernel void @test4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: test4: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] @@ -154,7 +181,6 @@ define amdgpu_kernel void @s_and_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_i32: ; EG: ; %bb.0: ; 
EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -194,7 +220,6 @@ define amdgpu_kernel void @s_and_constant_i32(ptr addrspace(1) %out, i32 %a) { ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_constant_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -252,7 +277,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_0(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_multi_use_constant_i32_0: ; EG: ; %bb.0: ; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] @@ -309,7 +333,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_1(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_multi_use_constant_i32_1: ; EG: ; %bb.0: ; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] @@ -371,7 +394,6 @@ define amdgpu_kernel void @v_and_i32_vgpr_vgpr(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_i32_vgpr_vgpr: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] @@ -440,7 +462,6 @@ define amdgpu_kernel void @v_and_i32_sgpr_vgpr(ptr addrspace(1) %out, i32 %a, pt ; GFX8-NEXT: v_and_b32_e32 v2, s2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_i32_sgpr_vgpr: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] @@ -504,7 +525,6 @@ define amdgpu_kernel void @v_and_i32_vgpr_sgpr(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: v_and_b32_e32 v2, s4, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_i32_vgpr_sgpr: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] @@ -568,7 +588,6 @@ define amdgpu_kernel void @v_and_constant_i32(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: v_and_b32_e32 v0, 0x12d687, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_constant_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] @@ -630,7 +649,6 @@ define amdgpu_kernel void @v_and_inline_imm_64_i32(ptr addrspace(1) %out, ptr ad ; GFX8-NEXT: v_and_b32_e32 v0, 64, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_inline_imm_64_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] @@ -692,7 +710,6 @@ define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(ptr addrspace(1) %out, pt ; GFX8-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_inline_imm_neg_16_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] @@ -749,7 +766,6 @@ define amdgpu_kernel void @s_and_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -794,7 +810,6 @@ define amdgpu_kernel void @s_and_i1(ptr addrspace(1) %out, i1 %a, i1 %b) { ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_i1: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[], KC1[] @@ -857,7 +872,6 @@ define amdgpu_kernel void @s_and_constant_i64(ptr addrspace(1) %out, i64 %a) { ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; 
GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_constant_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -921,7 +935,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_multi_use_constant_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 10, @6, KC0[CB0:0-32], KC1[] @@ -975,7 +988,6 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_32_bit_constant_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -1046,7 +1058,6 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_multi_use_inline_imm_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 17, @6, KC0[CB0:0-32], KC1[] @@ -1130,7 +1141,6 @@ define amdgpu_kernel void @v_and_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] @@ -1199,7 +1209,6 @@ define amdgpu_kernel void @v_and_constant_i64(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: v_and_b32_e32 v0, 0xab19b207, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_constant_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] @@ -1280,7 +1289,6 @@ define amdgpu_kernel void @v_and_multi_use_constant_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_multi_use_constant_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[] @@ -1382,7 +1390,6 @@ define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_multi_use_inline_imm_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[] @@ -1466,7 +1473,6 @@ define amdgpu_kernel void @v_and_i64_32_bit_constant(ptr addrspace(1) %out, ptr ; GFX8-NEXT: v_and_b32_e32 v0, 0x12d687, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_i64_32_bit_constant: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] @@ -1530,7 +1536,6 @@ define amdgpu_kernel void @v_and_inline_imm_i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_and_b32_e32 v0, 64, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_inline_imm_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] @@ -1595,7 +1600,6 @@ define amdgpu_kernel void @v_and_inline_neg_imm_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_and_b32_e32 v0, -8, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: v_and_inline_neg_imm_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] @@ -1648,7 +1652,6 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_64_i64: ; EG: ; 
%bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -1699,7 +1702,6 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %ou ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_64_i64_noshrink: ; EG: ; %bb.0: ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] @@ -1748,7 +1750,6 @@ define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr add ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_1_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -1791,7 +1792,6 @@ define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_1.0_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -1835,7 +1835,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_neg_1.0_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -1879,7 +1878,6 @@ define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_0.5_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -1923,7 +1921,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_neg_0.5_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -1967,7 +1964,6 @@ define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_2.0_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -2011,7 +2007,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_neg_2.0_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -2055,7 +2050,6 @@ define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_4.0_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -2099,7 +2093,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_neg_4.0_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -2146,7 +2139,6 @@ define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_f32_4.0_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -2189,7 
+2181,6 @@ define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %ou ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_imm_f32_neg_4.0_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -2234,7 +2225,6 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_high_imm_f32_4.0_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -2278,7 +2268,6 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(ptr addrspace(1 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; ; EG-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] From 109e482267822c1488a35e669bbd420cf3cd26b2 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 24 Jun 2025 06:50:20 -0500 Subject: [PATCH 09/25] Remove dead check-lines from or.ll --- llvm/test/CodeGen/AMDGPU/or.ll | 625 --------------------------------- 1 file changed, 625 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 0a71a644652fe..0da53f2a95953 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -72,32 +72,6 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: or_v2i32: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: v_mov_b32_e32 v1, s5 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: or_v2i32: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: v_mov_b32_e32 v1, s5 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: or_v2i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -167,42 +141,6 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: or_v4i32: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GFX6S-NEXT: s_mov_b32 s11, 0xf000 -; GFX6S-NEXT: s_mov_b32 s10, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s3, s3, s7 -; GFX6S-NEXT: s_or_b32 s2, s2, s6 -; GFX6S-NEXT: s_or_b32 s1, s1, s5 -; GFX6S-NEXT: s_or_b32 s0, s0, s4 -; GFX6S-NEXT: v_mov_b32_e32 v0, s0 -; GFX6S-NEXT: v_mov_b32_e32 v1, s1 -; GFX6S-NEXT: v_mov_b32_e32 v2, s2 -; GFX6S-NEXT: v_mov_b32_e32 v3, s3 -; GFX6S-NEXT: buffer_store_dwordx4 
v[0:3], off, s[8:11], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: or_v4i32: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GFX8S-NEXT: s_mov_b32 s11, 0xf000 -; GFX8S-NEXT: s_mov_b32 s10, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s3, s3, s7 -; GFX8S-NEXT: s_or_b32 s2, s2, s6 -; GFX8S-NEXT: s_or_b32 s1, s1, s5 -; GFX8S-NEXT: s_or_b32 s0, s0, s4 -; GFX8S-NEXT: v_mov_b32_e32 v0, s0 -; GFX8S-NEXT: v_mov_b32_e32 v1, s1 -; GFX8S-NEXT: v_mov_b32_e32 v2, s2 -; GFX8S-NEXT: v_mov_b32_e32 v3, s3 -; GFX8S-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: or_v4i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] @@ -257,30 +195,6 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: scalar_or_i32: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_mov_b32 s7, 0xf000 -; GFX6S-NEXT: s_mov_b32 s6, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_mov_b32 s4, s0 -; GFX6S-NEXT: s_or_b32 s0, s2, s3 -; GFX6S-NEXT: s_mov_b32 s5, s1 -; GFX6S-NEXT: v_mov_b32_e32 v0, s0 -; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: scalar_or_i32: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_mov_b32 s7, 0xf000 -; GFX8S-NEXT: s_mov_b32 s6, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_mov_b32 s4, s0 -; GFX8S-NEXT: s_or_b32 s0, s2, s3 -; GFX8S-NEXT: s_mov_b32 s5, s1 -; GFX8S-NEXT: v_mov_b32_e32 v0, s0 -; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: scalar_or_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -334,32 +248,6 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: vector_or_i32: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_load_dword s4, s[4:5], 0xd -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dword s5, s[2:3], 0x0 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s4, s5, s4 -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: vector_or_i32: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_load_dword s4, s[4:5], 0x34 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dword s5, s[2:3], 0x0 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s4, s5, s4 -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: vector_or_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -406,28 +294,6 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: scalar_or_literal_i32: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dword s6, s[4:5], 0xb -; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6S-NEXT: s_mov_b32 s3, 
0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s4, s6, 0x1869f -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: scalar_or_literal_i32: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x2c -; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s4, s6, 0x1869f -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: scalar_or_literal_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -471,32 +337,6 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: scalar_or_literal_i64: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s4, s7, 0xf237b -; GFX6S-NEXT: s_or_b32 s5, s6, 0x3039 -; GFX6S-NEXT: v_mov_b32_e32 v0, s5 -; GFX6S-NEXT: v_mov_b32_e32 v1, s4 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: scalar_or_literal_i64: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c -; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s4, s7, 0xf237b -; GFX8S-NEXT: s_or_b32 s5, s6, 0x3039 -; GFX8S-NEXT: v_mov_b32_e32 v0, s5 -; GFX8S-NEXT: v_mov_b32_e32 v1, s4 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: scalar_or_literal_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -559,49 +399,6 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: scalar_or_literal_multi_use_i64: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d -; GFX6S-NEXT: s_movk_i32 s8, 0x3039 -; GFX6S-NEXT: s_mov_b32 s9, 0xf237b -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX6S-NEXT: v_mov_b32_e32 v0, s6 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: v_mov_b32_e32 v1, s7 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_add_u32 s0, s4, 0x3039 -; GFX6S-NEXT: s_addc_u32 s1, s5, 0xf237b -; GFX6S-NEXT: s_waitcnt expcnt(0) -; GFX6S-NEXT: v_mov_b32_e32 v0, s0 -; GFX6S-NEXT: v_mov_b32_e32 v1, s1 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_waitcnt vmcnt(0) -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: scalar_or_literal_multi_use_i64: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c -; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74 -; GFX8S-NEXT: s_movk_i32 s8, 0x3039 -; GFX8S-NEXT: s_mov_b32 s9, 0xf237b -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX8S-NEXT: v_mov_b32_e32 v0, s6 -; 
GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: v_mov_b32_e32 v1, s7 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_add_u32 s0, s4, 0x3039 -; GFX8S-NEXT: s_addc_u32 s1, s5, 0xf237b -; GFX8S-NEXT: v_mov_b32_e32 v0, s0 -; GFX8S-NEXT: v_mov_b32_e32 v1, s1 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_waitcnt vmcnt(0) -; GFX8S-NEXT: s_endpgm ; EG-LABEL: scalar_or_literal_multi_use_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[] @@ -658,30 +455,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: scalar_or_inline_imm_i64: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s4, s6, 63 -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: v_mov_b32_e32 v1, s7 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: scalar_or_inline_imm_i64: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c -; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s4, s6, 63 -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: v_mov_b32_e32 v1, s7 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: scalar_or_inline_imm_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -741,47 +514,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: scalar_or_inline_imm_multi_use_i64: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX6S-NEXT: s_mov_b32 s7, 0xf000 -; GFX6S-NEXT: s_mov_b32 s6, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_mov_b32 s4, s0 -; GFX6S-NEXT: s_or_b32 s0, s2, 63 -; GFX6S-NEXT: s_mov_b32 s5, s1 -; GFX6S-NEXT: v_mov_b32_e32 v0, s0 -; GFX6S-NEXT: v_mov_b32_e32 v1, s3 -; GFX6S-NEXT: s_add_u32 s0, s8, 63 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX6S-NEXT: s_addc_u32 s1, s9, 0 -; GFX6S-NEXT: s_waitcnt expcnt(0) -; GFX6S-NEXT: v_mov_b32_e32 v0, s0 -; GFX6S-NEXT: v_mov_b32_e32 v1, s1 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX6S-NEXT: s_waitcnt vmcnt(0) -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: scalar_or_inline_imm_multi_use_i64: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 -; GFX8S-NEXT: s_mov_b32 s7, 0xf000 -; GFX8S-NEXT: s_mov_b32 s6, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_mov_b32 s4, s0 -; GFX8S-NEXT: s_or_b32 s0, s2, 63 -; GFX8S-NEXT: s_mov_b32 s5, s1 -; GFX8S-NEXT: v_mov_b32_e32 v0, s0 -; GFX8S-NEXT: v_mov_b32_e32 v1, s3 -; GFX8S-NEXT: s_add_u32 s0, s8, 63 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8S-NEXT: s_addc_u32 s1, s9, 0 -; GFX8S-NEXT: v_mov_b32_e32 v0, s0 -; GFX8S-NEXT: v_mov_b32_e32 v1, s1 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8S-NEXT: s_waitcnt vmcnt(0) -; GFX8S-NEXT: s_endpgm ; EG-LABEL: 
scalar_or_inline_imm_multi_use_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[] @@ -834,30 +566,6 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: scalar_or_neg_inline_imm_i64: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13 -; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: v_mov_b32_e32 v1, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s4, s6, -8 -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: scalar_or_neg_inline_imm_i64: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c -; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: v_mov_b32_e32 v1, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s4, s6, -8 -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: scalar_or_neg_inline_imm_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -911,30 +619,6 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: vector_or_literal_i32: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s4, s4, 0xffff -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: vector_or_literal_i32: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s4, s4, 0xffff -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: vector_or_literal_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -993,30 +677,6 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; GFX8-NEXT: v_or_b32_e32 v0, 4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: vector_or_inline_immediate_i32: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s4, s4, 4 -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: vector_or_inline_immediate_i32: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s4, s4, 4 -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; 
GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: vector_or_inline_immediate_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1069,34 +729,6 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: scalar_or_i64: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX6S-NEXT: s_mov_b32 s7, 0xf000 -; GFX6S-NEXT: s_mov_b32 s6, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_mov_b32 s4, s0 -; GFX6S-NEXT: s_mov_b32 s5, s1 -; GFX6S-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX6S-NEXT: v_mov_b32_e32 v0, s0 -; GFX6S-NEXT: v_mov_b32_e32 v1, s1 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: scalar_or_i64: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 -; GFX8S-NEXT: s_mov_b32 s7, 0xf000 -; GFX8S-NEXT: s_mov_b32 s6, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_mov_b32 s4, s0 -; GFX8S-NEXT: s_mov_b32 s5, s1 -; GFX8S-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX8S-NEXT: v_mov_b32_e32 v0, s0 -; GFX8S-NEXT: v_mov_b32_e32 v1, s1 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: scalar_or_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] @@ -1159,36 +791,6 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: vector_or_i64: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 -; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: v_mov_b32_e32 v1, s5 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: vector_or_i64: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 -; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: v_mov_b32_e32 v1, s5 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: vector_or_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] @@ -1255,34 +857,6 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_or_b32_e32 v1, s13, v1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: scalar_vector_or_i64: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: v_mov_b32_e32 v1, s5 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: scalar_vector_or_i64: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: v_mov_b32_e32 v1, s5 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: scalar_vector_or_i64: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1344,34 +918,6 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: vector_or_i64_loadimm: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s5, s5, 0x146f -; GFX6S-NEXT: s_or_b32 s4, s4, 0xdf77987f -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: v_mov_b32_e32 v1, s5 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: vector_or_i64_loadimm: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s5, s5, 0x146f -; GFX8S-NEXT: s_or_b32 s4, s4, 0xdf77987f -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: v_mov_b32_e32 v1, s5 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: vector_or_i64_loadimm: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1433,32 +979,6 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; GFX8-NEXT: v_or_b32_e32 v0, 8, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: vector_or_i64_imm: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s4, s4, 8 -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: v_mov_b32_e32 v1, s5 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: vector_or_i64_imm: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s4, s4, 8 -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: v_mov_b32_e32 v1, s5 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: vector_or_i64_imm: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1519,32 +1039,6 @@ 
define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; GFX8-NEXT: v_or_b32_e32 v0, -8, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: vector_or_i64_neg_inline_imm: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: v_mov_b32_e32 v1, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s4, s4, -8 -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: vector_or_i64_neg_inline_imm: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: v_mov_b32_e32 v1, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s4, s4, -8 -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: vector_or_i64_neg_inline_imm: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1607,32 +1101,6 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: vector_or_i64_neg_literal: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: v_mov_b32_e32 v1, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s4, s4, 0xffffff38 -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: vector_or_i64_neg_literal: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: v_mov_b32_e32 v1, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_or_b32 s4, s4, 0xffffff38 -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: vector_or_i64_neg_literal: ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1683,30 +1151,6 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: trunc_i64_or_to_i32: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dword s6, s[4:5], 0x13 -; GFX6S-NEXT: s_load_dword s7, s[4:5], 0x1d -; GFX6S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6S-NEXT: s_mov_b32 s3, 0xf000 -; GFX6S-NEXT: s_mov_b32 s2, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_or_b32 s4, s7, s6 -; GFX6S-NEXT: v_mov_b32_e32 v0, s4 -; GFX6S-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: trunc_i64_or_to_i32: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dword s6, s[4:5], 0x4c -; GFX8S-NEXT: s_load_dword s7, s[4:5], 0x74 -; GFX8S-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8S-NEXT: s_mov_b32 s3, 0xf000 -; GFX8S-NEXT: s_mov_b32 s2, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX8S-NEXT: s_or_b32 s4, s7, s6 -; GFX8S-NEXT: v_mov_b32_e32 v0, s4 -; GFX8S-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: trunc_i64_or_to_i32: ; EG: ; %bb.0: ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -1777,44 +1221,6 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: or_i1: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6S-NEXT: s_mov_b32 s7, 0xf000 -; GFX6S-NEXT: s_mov_b32 s6, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_load_dword s8, s[8:9], 0x0 -; GFX6S-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX6S-NEXT: s_mov_b32 s4, s0 -; GFX6S-NEXT: s_mov_b32 s5, s1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: v_mul_f32_e64 v0, 1.0, s8 -; GFX6S-NEXT: v_mul_f32_e64 v1, 1.0, s2 -; GFX6S-NEXT: v_max_f32_e32 v0, v1, v0 -; GFX6S-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 -; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6S-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: or_i1: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8S-NEXT: s_mov_b32 s7, 0xf000 -; GFX8S-NEXT: s_mov_b32 s6, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_load_dword s8, s[8:9], 0x0 -; GFX8S-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX8S-NEXT: s_mov_b32 s4, s0 -; GFX8S-NEXT: s_mov_b32 s5, s1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: v_mul_f32_e64 v0, 1.0, s8 -; GFX8S-NEXT: v_mul_f32_e64 v1, 1.0, s2 -; GFX8S-NEXT: v_max_f32_e32 v0, v1, v0 -; GFX8S-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 -; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8S-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: or_i1: ; EG: ; %bb.0: ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] @@ -1877,36 +1283,6 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; GFX6S-LABEL: s_or_i1: -; GFX6S: ; %bb.0: -; GFX6S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; GFX6S-NEXT: s_mov_b32 s7, 0xf000 -; GFX6S-NEXT: s_mov_b32 s6, -1 -; GFX6S-NEXT: s_waitcnt lgkmcnt(0) -; GFX6S-NEXT: s_cmp_eq_u32 s0, s1 -; GFX6S-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6S-NEXT: s_cmp_eq_u32 s2, s3 -; GFX6S-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6S-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX6S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX6S-NEXT: buffer_store_byte v0, off, s[4:7], 0 -; GFX6S-NEXT: s_endpgm -; GFX8S-LABEL: s_or_i1: -; GFX8S: ; %bb.0: -; GFX8S-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8S-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 -; GFX8S-NEXT: s_mov_b32 s7, 0xf000 -; GFX8S-NEXT: s_mov_b32 s6, -1 -; GFX8S-NEXT: s_waitcnt lgkmcnt(0) -; GFX8S-NEXT: s_cmp_eq_u32 s0, s1 -; GFX8S-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX8S-NEXT: s_cmp_eq_u32 s2, s3 -; GFX8S-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX8S-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX8S-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX8S-NEXT: buffer_store_byte v0, off, s[4:7], 0 -; GFX8S-NEXT: s_endpgm ; EG-LABEL: s_or_i1: ; EG: ; %bb.0: ; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[] @@ -1935,4 +1311,3 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c store 
i1 %or, ptr addrspace(1) %out ret void } - From e0f517fceee45bba5eae075478aeb0afb02ef6a5 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 24 Jun 2025 07:09:04 -0500 Subject: [PATCH 10/25] Apply reviewer comments to performFNegCombine --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 9a3a326bd3588..b927e51905d41 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5375,15 +5375,13 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, SDValue Cond = N0.getOperand(0); SDValue LHS = N0.getOperand(1); SDValue RHS = N0.getOperand(2); - EVT LHVT = LHS.getValueType(); - EVT RHVT = RHS.getValueType(); - // The regression was limited to i32 v2/i32. - if (RHVT != MVT::i32 && LHVT != MVT::i32) + EVT VT = LHS.getValueType(); + if (VT != MVT::i32) return SDValue(); - SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS); - SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS); - SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg); + SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, VT, LHS); + SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, VT, RHS); + SDValue Op = DAG.getNode(Opc, SL, VT, Cond, LFNeg, RFNeg); return Op; } case ISD::BITCAST: { From ede00b0a26822c8cb923cbfc82c33fd84a922c8f Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 24 Jun 2025 07:12:41 -0500 Subject: [PATCH 11/25] Remove dead code --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index b927e51905d41..6a764e9948136 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5366,23 +5366,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, } case ISD::SELECT: { // fneg (select c, a, b) -> select c, (fneg a), (fneg b) - // This combine became necessary recently to prevent a regression in - // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor. - // Specifically, additional instructions were added to the final codegen. - // When adding this combine a case was added to performFNEGCombine to - // prevent this combine from being undone under certain conditions. // TODO: Invert conditions of foldFreeOpFromSelect - SDValue Cond = N0.getOperand(0); - SDValue LHS = N0.getOperand(1); - SDValue RHS = N0.getOperand(2); - EVT VT = LHS.getValueType(); - if (VT != MVT::i32) - return SDValue(); - - SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, VT, LHS); - SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, VT, RHS); - SDValue Op = DAG.getNode(Opc, SL, VT, Cond, LFNeg, RFNeg); - return Op; + return SDValue(); } case ISD::BITCAST: { SDLoc SL(N); From c5904d80dd24ca9dd352befa59055e4105000caa Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Tue, 24 Jun 2025 10:36:20 -0500 Subject: [PATCH 12/25] Reinstate r600 tests in independent files. This has already been done for tens of other tests.
--- llvm/test/CodeGen/AMDGPU/and.ll | 659 +----------------- llvm/test/CodeGen/AMDGPU/and.r600.ll | 987 +++++++++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/or.ll | 350 ---------- llvm/test/CodeGen/AMDGPU/or.r600.ll | 515 ++++++++++++++ 4 files changed, 1503 insertions(+), 1008 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/and.r600.ll create mode 100644 llvm/test/CodeGen/AMDGPU/or.r600.ll diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index 4673df3183cfa..29bfc253e2e7e 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s + declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_ps <2 x i32> @s_and_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) { @@ -64,23 +65,6 @@ define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: test2: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: AND_INT * T0.Y, T0.Y, T0.W, -; EG-NEXT: AND_INT T0.X, T0.X, T0.Z, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 %a = load <2 x i32>, ptr addrspace(1) %in %b = load <2 x i32>, ptr addrspace(1) %b_ptr @@ -127,26 +111,6 @@ define amdgpu_kernel void @test4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: test4: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT * T0.W, T0.W, T1.W, -; EG-NEXT: AND_INT * T0.Z, T0.Z, T1.Z, -; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y, -; EG-NEXT: AND_INT T0.X, T0.X, T1.X, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in %b = load <4 x i32>, ptr addrspace(1) %b_ptr @@ -181,16 +145,6 @@ define amdgpu_kernel void @s_and_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: AND_INT * T1.X, 
KC0[2].Z, KC0[2].W, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i32 %a, %b store i32 %and, ptr addrspace(1) %out, align 4 ret void @@ -220,16 +174,6 @@ define amdgpu_kernel void @s_and_constant_i32(ptr addrspace(1) %out, i32 %a) { ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_constant_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: AND_INT * T1.X, KC0[2].Z, literal.y, -; EG-NEXT: 2(2.802597e-45), 1234567(1.729997e-39) %and = and i32 %a, 1234567 store i32 %and, ptr addrspace(1) %out, align 4 ret void @@ -277,19 +221,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_0(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_multi_use_constant_i32_0: -; EG: ; %bb.0: -; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: MOV T0.X, literal.x, -; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, -; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00) -; EG-NEXT: ADD_INT T1.X, PV.W, KC0[2].W, -; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i32 %a, 1234567 ; Just to stop future replacement of copy to vgpr + store with VALU op. @@ -333,19 +264,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_1(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_multi_use_constant_i32_1: -; EG: ; %bb.0: -; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, -; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.W, PV.W, KC0[2].W, -; EG-NEXT: ADD_INT T0.X, PV.W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) %and = and i32 %a, 1234567 %foo = add i32 %and, 1234567 %bar = add i32 %foo, %b @@ -394,27 +312,6 @@ define amdgpu_kernel void @v_and_i32_vgpr_vgpr(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_i32_vgpr_vgpr: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, -; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: AND_INT T0.X, T0.X, T1.X, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, -; EG-NEXT: LSHR * T1.X, PV.W, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid %gep.b 
= getelementptr i32, ptr addrspace(1) %bptr, i32 %tid @@ -462,25 +359,6 @@ define amdgpu_kernel void @v_and_i32_sgpr_vgpr(ptr addrspace(1) %out, i32 %a, pt ; GFX8-NEXT: v_and_b32_e32 v2, s2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_i32_sgpr_vgpr: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, KC0[2].Z, T0.X, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, -; EG-NEXT: LSHR * T1.X, PV.W, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.b = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -525,25 +403,6 @@ define amdgpu_kernel void @v_and_i32_vgpr_sgpr(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: v_and_b32_e32 v2, s4, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_i32_vgpr_sgpr: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, KC0[2].W, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, -; EG-NEXT: LSHR * T1.X, PV.W, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -588,24 +447,6 @@ define amdgpu_kernel void @v_and_constant_i32(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: v_and_b32_e32 v0, 0x12d687, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_constant_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid %a = load i32, ptr addrspace(1) %gep, align 4 @@ -649,24 +490,6 @@ define amdgpu_kernel void @v_and_inline_imm_64_i32(ptr addrspace(1) %out, ptr ad ; GFX8-NEXT: v_and_b32_e32 v0, 64, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_inline_imm_64_i32: -; EG: ; %bb.0: -; EG-NEXT: 
ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid %a = load i32, ptr addrspace(1) %gep, align 4 @@ -710,24 +533,6 @@ define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(ptr addrspace(1) %out, pt ; GFX8-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_inline_imm_neg_16_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -16(nan), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid %a = load i32, ptr addrspace(1) %gep, align 4 @@ -766,17 +571,6 @@ define amdgpu_kernel void @s_and_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, KC0[3].Z, -; EG-NEXT: AND_INT * T0.X, KC0[2].W, KC0[3].Y, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, %b store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -810,33 +604,6 @@ define amdgpu_kernel void @s_and_i1(ptr addrspace(1) %out, i1 %a, i1 %b) { ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_i1: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @10, KC0[], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3 -; EG-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.X, 0.0, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T1.X, T0.X, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PS, 1, -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T0.X, PV.W, PS, -; EG-NEXT: LSHL * T0.W, literal.x, PS, -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: 
MOV * T0.Z, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i1 %a, %b store i1 %and, ptr addrspace(1) %out ret void @@ -872,18 +639,6 @@ define amdgpu_kernel void @s_and_constant_i64(ptr addrspace(1) %out, i64 %a) { ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_constant_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 524288(7.346840e-40), 2(2.802597e-45) %and = and i64 %a, 549756338176 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -935,26 +690,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_multi_use_constant_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 10, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: ALU clause starting at 6: -; EG-NEXT: AND_INT T0.X, KC0[3].Y, literal.x, -; EG-NEXT: AND_INT * T1.X, KC0[3].Z, literal.y, -; EG-NEXT: 524288(7.346840e-40), 128(1.793662e-43) -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) -; EG-NEXT: LSHR T2.X, PV.W, literal.x, -; EG-NEXT: AND_INT * T3.X, KC0[3].X, literal.y, -; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43) -; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x, -; EG-NEXT: AND_INT * T5.X, KC0[2].W, literal.y, -; EG-NEXT: 2(2.802597e-45), 524288(7.346840e-40) %and0 = and i64 %a, 549756338176 %and1 = and i64 %b, 549756338176 store volatile i64 %and0, ptr addrspace(1) %out @@ -988,17 +723,6 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_32_bit_constant_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) %and = and i64 %a, 1234567 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1058,33 +782,6 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_multi_use_inline_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 17, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T5.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: ALU clause starting at 6: -; EG-NEXT: LSHL T0.W, KC0[3].W, 1, -; EG-NEXT: LSHL * T1.W, KC0[2].W, 1, -; EG-NEXT: AND_INT * T0.W, 
PV.W, literal.x, -; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.X, PV.W, KC0[4].W, -; EG-NEXT: AND_INT T1.W, T1.W, literal.x, -; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W, -; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T1.X, KC0[5].X, PS, -; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W, -; EG-NEXT: ADD_INT T2.X, KC0[5].X, PV.W, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) -; EG-NEXT: LSHR T3.X, PV.W, literal.x, -; EG-NEXT: ADD_INT * T4.X, T1.W, KC0[4].W, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %shl.a = shl i64 %a, 1 %shl.b = shl i64 %b, 1 %and0 = and i64 %shl.a, 62 @@ -1141,27 +838,6 @@ define amdgpu_kernel void @v_and_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, -; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y, -; EG-NEXT: AND_INT T0.X, T0.X, T1.X, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid %a = load i64, ptr addrspace(1) %gep.a, align 8 @@ -1209,26 +885,6 @@ define amdgpu_kernel void @v_and_constant_i64(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: v_and_b32_e32 v0, 0xab19b207, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_constant_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT * T0.Y, T0.Y, literal.x, -; EG-NEXT: 286(4.007714e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid %a = load i64, ptr addrspace(1) %gep.a, align 8 @@ -1289,46 +945,6 @@ define amdgpu_kernel void @v_and_multi_use_constant_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_multi_use_constant_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @12 -; EG-NEXT: ALU 0, @22, KC0[], KC1[] -; EG-NEXT: TEX 0 @14 -; EG-NEXT: ALU 0, @23, KC0[], KC1[] -; EG-NEXT: TEX 1 @16 -; EG-NEXT: ALU 
10, @24, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T5.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T5.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 12: -; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1 -; EG-NEXT: Fetch clause starting at 14: -; EG-NEXT: VTX_READ_32 T2.X, T2.X, 0, #1 -; EG-NEXT: Fetch clause starting at 16: -; EG-NEXT: VTX_READ_32 T3.X, T3.X, 4, #1 -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 20: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: MOV * T1.X, PV.X, -; EG-NEXT: ALU clause starting at 22: -; EG-NEXT: MOV * T2.X, T0.X, -; EG-NEXT: ALU clause starting at 23: -; EG-NEXT: MOV * T3.X, T0.X, -; EG-NEXT: ALU clause starting at 24: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: AND_INT * T3.X, T3.X, literal.y, -; EG-NEXT: -1424379385(-5.460358e-13), 286(4.007714e-43) -; EG-NEXT: AND_INT T2.X, T2.X, literal.x, -; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, -; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45) -; EG-NEXT: AND_INT T1.X, T1.X, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 286(4.007714e-43), 4(5.605194e-45) -; EG-NEXT: LSHR * T5.X, PV.W, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %a = load volatile i64, ptr addrspace(1) %aptr %b = load volatile i64, ptr addrspace(1) %aptr %and0 = and i64 %a, 1231231234567 @@ -1390,44 +1006,6 @@ define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_multi_use_inline_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @12 -; EG-NEXT: ALU 0, @22, KC0[], KC1[] -; EG-NEXT: TEX 0 @14 -; EG-NEXT: ALU 0, @23, KC0[], KC1[] -; EG-NEXT: TEX 1 @16 -; EG-NEXT: ALU 8, @24, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 12: -; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1 -; EG-NEXT: Fetch clause starting at 14: -; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 -; EG-NEXT: Fetch clause starting at 16: -; EG-NEXT: VTX_READ_32 T2.X, T2.X, 4, #1 -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 20: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: MOV * T1.X, PV.X, -; EG-NEXT: ALU clause starting at 22: -; EG-NEXT: MOV * T1.X, T0.X, -; EG-NEXT: ALU clause starting at 23: -; EG-NEXT: MOV * T2.X, T0.X, -; EG-NEXT: ALU clause starting at 24: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: AND_INT * T1.X, T1.X, literal.x, -; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45) -; EG-NEXT: LSHR T3.X, PV.W, literal.x, -; EG-NEXT: MOV * T4.X, literal.y, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %a = load volatile i64, ptr addrspace(1) %aptr %b = load volatile i64, ptr addrspace(1) %aptr %and0 = and i64 %a, 63 @@ -1473,25 +1051,6 @@ define amdgpu_kernel void @v_and_i64_32_bit_constant(ptr addrspace(1) %out, ptr ; GFX8-NEXT: v_and_b32_e32 v0, 0x12d687, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: 
v_and_i64_32_bit_constant: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid %a = load i64, ptr addrspace(1) %gep.a, align 8 @@ -1536,25 +1095,6 @@ define amdgpu_kernel void @v_and_inline_imm_i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_and_b32_e32 v0, 64, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_inline_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid %a = load i64, ptr addrspace(1) %gep.a, align 8 @@ -1600,24 +1140,6 @@ define amdgpu_kernel void @v_and_inline_neg_imm_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_and_b32_e32 v0, -8, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: v_and_inline_neg_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -8(nan), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid %a = load i64, ptr addrspace(1) %gep.a, align 8 @@ -1652,17 +1174,6 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_64_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; 
EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) %and = and i64 %a, 64 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1702,21 +1213,6 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %ou ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_64_i64_noshrink: -; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHL * T0.W, KC0[2].W, 1, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.X, PV.W, KC0[3].W, -; EG-NEXT: ADDC_UINT T0.W, PV.W, KC0[3].W, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.Y, KC0[4].X, PV.W, %shl = shl i64 %a, 1 %and = and i64 %shl, 64 %add = add i64 %and, %b @@ -1750,17 +1246,6 @@ define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr add ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_1_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.X, KC0[2].W, 1, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 1 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1792,18 +1277,6 @@ define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_1.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 1072693248(1.875000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 4607182418800017408 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1835,18 +1308,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_neg_1.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: -1074790400(-1.875000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 13830554455654793216 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1878,18 +1339,6 @@ define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_0.5_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW 
T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 1071644672(1.750000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 4602678819172646912 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1921,18 +1370,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_neg_0.5_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: -1075838976(-1.750000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 13826050856027422720 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1964,18 +1401,6 @@ define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_2.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 4611686018427387904 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2007,18 +1432,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_neg_2.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: -1073741824(-2.000000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 13835058055282163712 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2050,18 +1463,6 @@ define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 1074790400(2.250000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 4616189618054758400 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2093,18 +1494,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) 
%out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_neg_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: -1072693248(-2.250000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 13839561654909534208 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2139,17 +1528,6 @@ define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_f32_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 1082130432(4.000000e+00), 2(2.802597e-45) %and = and i64 %a, 1082130432 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2181,17 +1559,6 @@ define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %ou ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_imm_f32_neg_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: MOV * T0.Y, KC0[3].X, -; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -1065353216(-4.000000e+00), 2(2.802597e-45) %and = and i64 %a, -1065353216 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2225,18 +1592,6 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_high_imm_f32_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 1082130432(4.000000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 4647714815446351872 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2268,18 +1623,6 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(ptr addrspace(1 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: -1065353216(-4.000000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and 
= and i64 %a, 13871086852301127680 store i64 %and, ptr addrspace(1) %out, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/and.r600.ll b/llvm/test/CodeGen/AMDGPU/and.r600.ll new file mode 100644 index 0000000000000..590b1ac899fcf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/and.r600.ll @@ -0,0 +1,987 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s + + +define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; EG-LABEL: test2: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.Y, T0.Y, T0.W, +; EG-NEXT: AND_INT T0.X, T0.X, T0.Z, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 + %a = load <2 x i32>, ptr addrspace(1) %in + %b = load <2 x i32>, ptr addrspace(1) %b_ptr + %result = and <2 x i32> %a, %b + store <2 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test4(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; EG-LABEL: test4: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT * T0.W, T0.W, T1.W, +; EG-NEXT: AND_INT * T0.Z, T0.Z, T1.Z, +; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y, +; EG-NEXT: AND_INT T0.X, T0.X, T1.X, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 + %a = load <4 x i32>, ptr addrspace(1) %in + %b = load <4 x i32>, ptr addrspace(1) %b_ptr + %result = and <4 x i32> %a, %b + store <4 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_and_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { +; EG-LABEL: s_and_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT * T1.X, KC0[2].Z, KC0[2].W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i32 %a, %b + store i32 %and, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @s_and_constant_i32(ptr addrspace(1) %out, i32 %a) { +; EG-LABEL: s_and_constant_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT * T1.X, KC0[2].Z, literal.y, +; EG-NEXT: 2(2.802597e-45), 1234567(1.729997e-39) + %and = and i32 %a, 1234567 + store i32 %and, ptr addrspace(1) 
%out, align 4 + ret void +} + +; FIXME: We should really duplicate the constant so that the SALU use +; can fold into the s_and_b32 and the VALU one is materialized +; directly without copying from the SGPR. + +; Second use is a VGPR use of the constant. + +define amdgpu_kernel void @s_and_multi_use_constant_i32_0(ptr addrspace(1) %out, i32 %a, i32 %b) { +; EG-LABEL: s_and_multi_use_constant_i32_0: +; EG: ; %bb.0: +; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: MOV T0.X, literal.x, +; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00) +; EG-NEXT: ADD_INT T1.X, PV.W, KC0[2].W, +; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i32 %a, 1234567 + + ; Just to stop future replacement of copy to vgpr + store with VALU op. + %foo = add i32 %and, %b + store volatile i32 %foo, ptr addrspace(1) %out + store volatile i32 1234567, ptr addrspace(1) %out + ret void +} + +; Second use is another SGPR use of the constant. + +define amdgpu_kernel void @s_and_multi_use_constant_i32_1(ptr addrspace(1) %out, i32 %a, i32 %b) { +; EG-LABEL: s_and_multi_use_constant_i32_1: +; EG: ; %bb.0: +; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PV.W, KC0[2].W, +; EG-NEXT: ADD_INT T0.X, PV.W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) + %and = and i32 %a, 1234567 + %foo = add i32 %and, 1234567 + %bar = add i32 %foo, %b + store volatile i32 %bar, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_and_i32_vgpr_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { +; EG-LABEL: v_and_i32_vgpr_vgpr: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, +; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: AND_INT T0.X, T0.X, T1.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %gep.b = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid + %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid + %a = load i32, ptr addrspace(1) %gep.a + %b = load i32, ptr addrspace(1) %gep.b + %and = and i32 %a, %b + store i32 %and, ptr addrspace(1) %gep.out + ret void +} + +define amdgpu_kernel void @v_and_i32_sgpr_vgpr(ptr addrspace(1) %out, i32 %a, ptr addrspace(1) %bptr) { +; EG-LABEL: v_and_i32_sgpr_vgpr: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] +; 
EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, KC0[2].Z, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.b = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid + %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid + %b = load i32, ptr addrspace(1) %gep.b + %and = and i32 %a, %b + store i32 %and, ptr addrspace(1) %gep.out + ret void +} + +define amdgpu_kernel void @v_and_i32_vgpr_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i32 %b) { +; EG-LABEL: v_and_i32_vgpr_sgpr: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, KC0[2].W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid + %a = load i32, ptr addrspace(1) %gep.a + %and = and i32 %a, %b + store i32 %and, ptr addrspace(1) %gep.out + ret void +} + +define amdgpu_kernel void @v_and_constant_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_constant_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %a = load i32, ptr addrspace(1) %gep, align 4 + %and = and i32 %a, 1234567 + store i32 %and, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v_and_inline_imm_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_inline_imm_64_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 2(2.802597e-45), 
0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %a = load i32, ptr addrspace(1) %gep, align 4 + %and = and i32 %a, 64 + store i32 %and, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_inline_imm_neg_16_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -16(nan), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %a = load i32, ptr addrspace(1) %gep, align 4 + %and = and i32 %a, -16 + store i32 %and, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @s_and_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { +; EG-LABEL: s_and_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, KC0[3].Z, +; EG-NEXT: AND_INT * T0.X, KC0[2].W, KC0[3].Y, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, %b + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_i1(ptr addrspace(1) %out, i1 %a, i1 %b) { +; EG-LABEL: s_and_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T1.X, T0.X, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PS, 1, +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i1 %a, %b + store i1 %and, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_and_constant_i64(ptr addrspace(1) %out, i64 %a) { +; EG-LABEL: s_and_constant_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; 
EG-NEXT: 128(1.793662e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 524288(7.346840e-40), 2(2.802597e-45) + %and = and i64 %a, 549756338176 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { +; EG-LABEL: s_and_multi_use_constant_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 10, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: ALU clause starting at 6: +; EG-NEXT: AND_INT T0.X, KC0[3].Y, literal.x, +; EG-NEXT: AND_INT * T1.X, KC0[3].Z, literal.y, +; EG-NEXT: 524288(7.346840e-40), 128(1.793662e-43) +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LSHR T2.X, PV.W, literal.x, +; EG-NEXT: AND_INT * T3.X, KC0[3].X, literal.y, +; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43) +; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT * T5.X, KC0[2].W, literal.y, +; EG-NEXT: 2(2.802597e-45), 524288(7.346840e-40) + %and0 = and i64 %a, 549756338176 + %and1 = and i64 %b, 549756338176 + store volatile i64 %and0, ptr addrspace(1) %out + store volatile i64 %and1, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32, i64 %a) { +; EG-LABEL: s_and_32_bit_constant_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) + %and = and i64 %a, 1234567 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) { +; EG-LABEL: s_and_multi_use_inline_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 17, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T5.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: ALU clause starting at 6: +; EG-NEXT: LSHL T0.W, KC0[3].W, 1, +; EG-NEXT: LSHL * T1.W, KC0[2].W, 1, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T0.X, PV.W, KC0[4].W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W, +; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T1.X, KC0[5].X, PS, +; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W, +; EG-NEXT: ADD_INT T2.X, KC0[5].X, PV.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LSHR T3.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T4.X, T1.W, KC0[4].W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %shl.a = shl i64 %a, 1 + %shl.b = shl i64 %b, 1 + %and0 = and i64 %shl.a, 62 + %and1 = and i64 %shl.b, 62 + %add0 = add i64 %and0, %c + %add1 = add i64 %and1, %c + store volatile i64 %add0, ptr addrspace(1) %out + 
store volatile i64 %add1, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_and_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { +; EG-LABEL: v_and_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, +; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y, +; EG-NEXT: AND_INT T0.X, T0.X, T1.X, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid + %a = load i64, ptr addrspace(1) %gep.a, align 8 + %gep.b = getelementptr i64, ptr addrspace(1) %bptr, i32 %tid + %b = load i64, ptr addrspace(1) %gep.b, align 8 + %and = and i64 %a, %b + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @v_and_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_constant_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT * T0.Y, T0.Y, literal.x, +; EG-NEXT: 286(4.007714e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid + %a = load i64, ptr addrspace(1) %gep.a, align 8 + %and = and i64 %a, 1231231234567 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @v_and_multi_use_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_multi_use_constant_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @12 +; EG-NEXT: ALU 0, @22, KC0[], KC1[] +; EG-NEXT: TEX 0 @14 +; EG-NEXT: ALU 0, @23, KC0[], KC1[] +; EG-NEXT: TEX 1 @16 +; EG-NEXT: ALU 10, @24, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T5.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T5.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: Fetch clause starting at 12: +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1 +; EG-NEXT: Fetch clause starting at 14: +; EG-NEXT: VTX_READ_32 T2.X, T2.X, 0, #1 +; EG-NEXT: Fetch clause starting at 16: +; EG-NEXT: VTX_READ_32 T3.X, T3.X, 4, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 20: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: MOV * T1.X, PV.X, +; EG-NEXT: ALU clause starting at 22: +; EG-NEXT: MOV * T2.X, 
T0.X, +; EG-NEXT: ALU clause starting at 23: +; EG-NEXT: MOV * T3.X, T0.X, +; EG-NEXT: ALU clause starting at 24: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: AND_INT * T3.X, T3.X, literal.y, +; EG-NEXT: -1424379385(-5.460358e-13), 286(4.007714e-43) +; EG-NEXT: AND_INT T2.X, T2.X, literal.x, +; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, +; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45) +; EG-NEXT: AND_INT T1.X, T1.X, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 286(4.007714e-43), 4(5.605194e-45) +; EG-NEXT: LSHR * T5.X, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load volatile i64, ptr addrspace(1) %aptr + %b = load volatile i64, ptr addrspace(1) %aptr + %and0 = and i64 %a, 1231231234567 + %and1 = and i64 %b, 1231231234567 + store volatile i64 %and0, ptr addrspace(1) %out + store volatile i64 %and1, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_multi_use_inline_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @12 +; EG-NEXT: ALU 0, @22, KC0[], KC1[] +; EG-NEXT: TEX 0 @14 +; EG-NEXT: ALU 0, @23, KC0[], KC1[] +; EG-NEXT: TEX 1 @16 +; EG-NEXT: ALU 8, @24, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: Fetch clause starting at 12: +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1 +; EG-NEXT: Fetch clause starting at 14: +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 +; EG-NEXT: Fetch clause starting at 16: +; EG-NEXT: VTX_READ_32 T2.X, T2.X, 4, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 20: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: MOV * T1.X, PV.X, +; EG-NEXT: ALU clause starting at 22: +; EG-NEXT: MOV * T1.X, T0.X, +; EG-NEXT: ALU clause starting at 23: +; EG-NEXT: MOV * T2.X, T0.X, +; EG-NEXT: ALU clause starting at 24: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: AND_INT * T1.X, T1.X, literal.x, +; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45) +; EG-NEXT: LSHR T3.X, PV.W, literal.x, +; EG-NEXT: MOV * T4.X, literal.y, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load volatile i64, ptr addrspace(1) %aptr + %b = load volatile i64, ptr addrspace(1) %aptr + %and0 = and i64 %a, 63 + %and1 = and i64 %b, 63 + store volatile i64 %and0, ptr addrspace(1) %out + store volatile i64 %and1, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_and_i64_32_bit_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_i64_32_bit_constant: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 
1234567(1.729997e-39), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid + %a = load i64, ptr addrspace(1) %gep.a, align 8 + %and = and i64 %a, 1234567 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @v_and_inline_imm_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_inline_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid + %a = load i64, ptr addrspace(1) %gep.a, align 8 + %and = and i64 %a, 64 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +; FIXME: Should be able to reduce load width + +define amdgpu_kernel void @v_and_inline_neg_imm_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_inline_neg_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -8(nan), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid + %a = load i64, ptr addrspace(1) %gep.a, align 8 + %and = and i64 %a, -8 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_64_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) + %and = and i64 %a, 64 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a, i32, i64 %b) { +; EG-LABEL: s_and_inline_imm_64_i64_noshrink: +; EG: ; %bb.0: +; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHL * T0.W, KC0[2].W, 1, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T0.X, PV.W, KC0[3].W, +; 
EG-NEXT: ADDC_UINT T0.W, PV.W, KC0[3].W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.Y, KC0[4].X, PV.W, + %shl = shl i64 %a, 1 + %and = and i64 %shl, 64 + %add = add i64 %and, %b + store i64 %add, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_1_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT T0.X, KC0[2].W, 1, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 1 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_1.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: 1072693248(1.875000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 4607182418800017408 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_neg_1.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: -1074790400(-1.875000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 13830554455654793216 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_0.5_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: 1071644672(1.750000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 4602678819172646912 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_neg_0.5_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: -1075838976(-1.750000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 13826050856027422720 + store i64 %and, ptr addrspace(1) %out, align 
8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_2.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 4611686018427387904 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_neg_2.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: -1073741824(-2.000000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 13835058055282163712 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_4.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: 1074790400(2.250000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 4616189618054758400 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_neg_4.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: -1072693248(-2.250000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 13839561654909534208 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +; Test with the 64-bit integer bitpattern for a 32-bit float in the +; low 32-bits, which is not a valid 64-bit inline immediate. 
+ +define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_f32_4.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 1082130432(4.000000e+00), 2(2.802597e-45) + %and = and i64 %a, 1082130432 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_f32_neg_4.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: MOV * T0.Y, KC0[3].X, +; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -1065353216(-4.000000e+00), 2(2.802597e-45) + %and = and i64 %a, -1065353216 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +; Shift into upper 32-bits + +define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_high_imm_f32_4.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: 1082130432(4.000000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 4647714815446351872 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: -1065353216(-4.000000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 13871086852301127680 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 0da53f2a95953..728067edcf399 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -72,23 +72,6 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: or_v2i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: 
OR_INT * T0.Y, T0.Y, T0.W, -; EG-NEXT: OR_INT T0.X, T0.X, T0.Z, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 %a = load <2 x i32>, ptr addrspace(1) %in %b = load <2 x i32>, ptr addrspace(1) %b_ptr @@ -141,26 +124,6 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: or_v4i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: OR_INT * T0.W, T0.W, T1.W, -; EG-NEXT: OR_INT * T0.Z, T0.Z, T1.Z, -; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y, -; EG-NEXT: OR_INT T0.X, T0.X, T1.X, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in %b = load <4 x i32>, ptr addrspace(1) %b_ptr @@ -195,16 +158,6 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: scalar_or_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T1.X, KC0[2].Z, KC0[2].W, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %or = or i32 %a, %b store i32 %or, ptr addrspace(1) %out ret void @@ -248,22 +201,6 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: vector_or_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %loada = load i32, ptr addrspace(1) %a %or = or i32 %loada, %b store i32 %or, ptr addrspace(1) %out @@ -294,16 +231,6 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: scalar_or_literal_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T1.X, KC0[2].Z, literal.y, -; EG-NEXT: 2(2.802597e-45), 99999(1.401284e-40) %or = or i32 %a, 99999 store i32 %or, ptr addrspace(1) %out, align 4 ret 
void @@ -337,18 +264,6 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: scalar_or_literal_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: OR_INT * T0.Y, KC0[5].X, literal.x, -; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) -; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45) %or = or i64 %a, 4261135838621753 store i64 %or, ptr addrspace(1) %out ret void @@ -399,28 +314,6 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; EG-LABEL: scalar_or_literal_multi_use_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 6: -; EG-NEXT: ADDC_UINT * T0.W, KC0[7].Y, literal.x, -; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.X, KC0[7].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[7].Z, PV.W, -; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00) -; EG-NEXT: ADD_INT T1.X, PV.W, literal.x, -; EG-NEXT: MOV * T2.X, literal.y, -; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) -; EG-NEXT: OR_INT * T3.Y, KC0[5].X, literal.x, -; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) -; EG-NEXT: OR_INT T3.X, KC0[4].W, literal.x, -; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, -; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45) %or = or i64 %a, 4261135838621753 store i64 %or, ptr addrspace(1) %out @@ -455,17 +348,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: scalar_or_inline_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: MOV * T0.Y, KC0[5].X, -; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45) %or = or i64 %a, 63 store i64 %or, ptr addrspace(1) %out ret void @@ -514,25 +396,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; EG-LABEL: scalar_or_inline_imm_multi_use_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 6: -; EG-NEXT: ADD_INT T0.X, KC0[3].Y, literal.x, -; EG-NEXT: ADDC_UINT * T0.W, KC0[3].Y, literal.x, -; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T1.X, KC0[3].Z, PV.W, -; EG-NEXT: MOV * T2.X, literal.x, -; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) -; EG-NEXT: MOV * T3.Y, KC0[3].X, -; EG-NEXT: OR_INT 
T3.X, KC0[2].W, literal.x, -; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, -; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45) %or = or i64 %a, 63 store i64 %or, ptr addrspace(1) %out %foo = add i64 %b, 63 @@ -566,18 +429,6 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: scalar_or_neg_inline_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -8(nan), 2(2.802597e-45) -; EG-NEXT: MOV * T0.Y, literal.x, -; EG-NEXT: -1(nan), 0(0.000000e+00) %or = or i64 %a, -8 store i64 %or, ptr addrspace(1) %out ret void @@ -619,22 +470,6 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: vector_or_literal_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) %loada = load i32, ptr addrspace(1) %a, align 4 %or = or i32 %loada, 65535 store i32 %or, ptr addrspace(1) %out, align 4 @@ -677,22 +512,6 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; GFX8-NEXT: v_or_b32_e32 v0, 4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: vector_or_inline_immediate_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45) %loada = load i32, ptr addrspace(1) %a, align 4 %or = or i32 %loada, 4 store i32 %or, ptr addrspace(1) %out, align 4 @@ -729,17 +548,6 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: scalar_or_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: OR_INT * T0.Y, KC0[3].X, KC0[3].Z, -; EG-NEXT: OR_INT * T0.X, KC0[2].W, KC0[3].Y, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %or = or i64 %a, %b store i64 %or, ptr addrspace(1) %out ret void @@ -791,25 +599,6 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8-NEXT: v_or_b32_e32 v1, 
v3, v1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: vector_or_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV T0.X, KC0[2].Z, -; EG-NEXT: MOV * T1.X, KC0[2].W, -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y, -; EG-NEXT: OR_INT T0.X, T0.X, T1.X, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %loada = load i64, ptr addrspace(1) %a, align 8 %loadb = load i64, ptr addrspace(1) %b, align 8 %or = or i64 %loada, %loadb @@ -857,23 +646,6 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_or_b32_e32 v1, s13, v1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: scalar_vector_or_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT * T0.Y, T0.Y, KC0[3].X, -; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %loada = load i64, ptr addrspace(1) %a %or = or i64 %loada, %b store i64 %or, ptr addrspace(1) %out @@ -918,24 +690,6 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: vector_or_i64_loadimm: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT * T0.Y, T0.Y, literal.x, -; EG-NEXT: 5231(7.330192e-42), 0(0.000000e+00) -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -545810305(-1.784115e+19), 2(2.802597e-45) %loada = load i64, ptr addrspace(1) %a, align 8 %or = or i64 %loada, 22470723082367 store i64 %or, ptr addrspace(1) %out @@ -979,22 +733,6 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; GFX8-NEXT: v_or_b32_e32 v0, 8, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: vector_or_i64_imm: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, 
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) %loada = load i64, ptr addrspace(1) %a, align 8 %or = or i64 %loada, 8 store i64 %or, ptr addrspace(1) %out @@ -1039,24 +777,6 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; GFX8-NEXT: v_or_b32_e32 v0, -8, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: vector_or_i64_neg_inline_imm: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -8(nan), 2(2.802597e-45) -; EG-NEXT: MOV * T0.Y, literal.x, -; EG-NEXT: -1(nan), 0(0.000000e+00) %loada = load i64, ptr addrspace(1) %a, align 8 %or = or i64 %loada, -8 store i64 %or, ptr addrspace(1) %out @@ -1101,24 +821,6 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: vector_or_i64_neg_literal: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -200(nan), 2(2.802597e-45) -; EG-NEXT: MOV * T0.Y, literal.x, -; EG-NEXT: -1(nan), 0(0.000000e+00) %loada = load i64, ptr addrspace(1) %a, align 8 %or = or i64 %loada, -200 store i64 %or, ptr addrspace(1) %out @@ -1151,16 +853,6 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: trunc_i64_or_to_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.X, KC0[7].Y, KC0[4].W, %add = or i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, ptr addrspace(1) %out, align 8 @@ -1221,26 +913,6 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: or_i1: -; EG: ; %bb.0: -; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 4, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV T0.X, KC0[2].Z, -; EG-NEXT: MOV * T1.X, KC0[2].W, -; EG-NEXT: ALU clause starting at 12: -; 
EG-NEXT: MAX_DX10 * T0.W, T0.X, T1.X, -; EG-NEXT: SETGE_DX10 * T0.W, PV.W, 0.0, -; EG-NEXT: AND_INT T0.X, PV.W, 1, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -1283,28 +955,6 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; EG-LABEL: s_or_i1: -; EG: ; %bb.0: -; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: SETE_INT T0.W, KC0[3].X, KC0[3].Y, -; EG-NEXT: SETE_INT * T1.W, KC0[2].Z, KC0[2].W, -; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.W, PS, 1, -; EG-NEXT: LSHL * T1.W, PV.W, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T0.X, PV.W, PS, -; EG-NEXT: LSHL * T0.W, literal.x, PS, -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: MOV * T0.Z, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %cmp0 = icmp eq i32 %a, %b %cmp1 = icmp eq i32 %c, %d %or = or i1 %cmp0, %cmp1 diff --git a/llvm/test/CodeGen/AMDGPU/or.r600.ll b/llvm/test/CodeGen/AMDGPU/or.r600.ll new file mode 100644 index 0000000000000..ed9d0085fd82a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/or.r600.ll @@ -0,0 +1,515 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s + + +define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; EG-LABEL: or_v2i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT * T0.Y, T0.Y, T0.W, +; EG-NEXT: OR_INT T0.X, T0.X, T0.Z, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 + %a = load <2 x i32>, ptr addrspace(1) %in + %b = load <2 x i32>, ptr addrspace(1) %b_ptr + %result = or <2 x i32> %a, %b + store <2 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; EG-LABEL: or_v4i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: OR_INT * T0.W, T0.W, T1.W, +; EG-NEXT: OR_INT * T0.Z, T0.Z, T1.Z, +; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y, +; EG-NEXT: OR_INT T0.X, T0.X, T1.X, +; EG-NEXT: LSHR * T1.X, 
KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 + %a = load <4 x i32>, ptr addrspace(1) %in + %b = load <4 x i32>, ptr addrspace(1) %b_ptr + %result = or <4 x i32> %a, %b + store <4 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { +; EG-LABEL: scalar_or_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T1.X, KC0[2].Z, KC0[2].W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %or = or i32 %a, %b + store i32 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) { +; EG-LABEL: vector_or_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %loada = load i32, ptr addrspace(1) %a + %or = or i32 %loada, %b + store i32 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) { +; EG-LABEL: scalar_or_literal_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T1.X, KC0[2].Z, literal.y, +; EG-NEXT: 2(2.802597e-45), 99999(1.401284e-40) + %or = or i32 %a, 99999 + store i32 %or, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; EG-LABEL: scalar_or_literal_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: OR_INT * T0.Y, KC0[5].X, literal.x, +; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45) + %or = or i64 %a, 4261135838621753 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { +; EG-LABEL: scalar_or_literal_multi_use_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 6: +; EG-NEXT: ADDC_UINT * T0.W, KC0[7].Y, literal.x, +; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00) +; EG-NEXT: ADD_INT T0.X, KC0[7].Y, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[7].Z, PV.W, +; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00) +; EG-NEXT: ADD_INT T1.X, PV.W, literal.x, +; 
EG-NEXT: MOV * T2.X, literal.y, +; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) +; EG-NEXT: OR_INT * T3.Y, KC0[5].X, literal.x, +; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) +; EG-NEXT: OR_INT T3.X, KC0[4].W, literal.x, +; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, +; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45) + %or = or i64 %a, 4261135838621753 + store i64 %or, ptr addrspace(1) %out + + %foo = add i64 %b, 4261135838621753 + store volatile i64 %foo, ptr addrspace(1) poison + ret void +} + +define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; EG-LABEL: scalar_or_inline_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: MOV * T0.Y, KC0[5].X, +; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45) + %or = or i64 %a, 63 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { +; EG-LABEL: scalar_or_inline_imm_multi_use_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 6: +; EG-NEXT: ADD_INT T0.X, KC0[3].Y, literal.x, +; EG-NEXT: ADDC_UINT * T0.W, KC0[3].Y, literal.x, +; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T1.X, KC0[3].Z, PV.W, +; EG-NEXT: MOV * T2.X, literal.x, +; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) +; EG-NEXT: MOV * T3.Y, KC0[3].X, +; EG-NEXT: OR_INT T3.X, KC0[2].W, literal.x, +; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, +; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45) + %or = or i64 %a, 63 + store i64 %or, ptr addrspace(1) %out + %foo = add i64 %b, 63 + store volatile i64 %foo, ptr addrspace(1) poison + ret void +} + +define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; EG-LABEL: scalar_or_neg_inline_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -8(nan), 2(2.802597e-45) +; EG-NEXT: MOV * T0.Y, literal.x, +; EG-NEXT: -1(nan), 0(0.000000e+00) + %or = or i64 %a, -8 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_literal_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) + %loada = load i32, ptr addrspace(1) %a, align 4 + %or = or i32 %loada, 65535 + store i32 %or, ptr addrspace(1) %out, align 4 + ret 
void +} + +define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_inline_immediate_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45) + %loada = load i32, ptr addrspace(1) %a, align 4 + %or = or i32 %loada, 4 + store i32 %or, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { +; EG-LABEL: scalar_or_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: OR_INT * T0.Y, KC0[3].X, KC0[3].Z, +; EG-NEXT: OR_INT * T0.X, KC0[2].W, KC0[3].Y, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %or = or i64 %a, %b + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV T0.X, KC0[2].Z, +; EG-NEXT: MOV * T1.X, KC0[2].W, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y, +; EG-NEXT: OR_INT T0.X, T0.X, T1.X, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %loada = load i64, ptr addrspace(1) %a, align 8 + %loadb = load i64, ptr addrspace(1) %b, align 8 + %or = or i64 %loada, %loadb + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) { +; EG-LABEL: scalar_vector_or_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT * T0.Y, T0.Y, KC0[3].X, +; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %loada = load i64, ptr addrspace(1) %a + %or = or i64 %loada, %b + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_i64_loadimm: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT * T0.Y, T0.Y, literal.x, +; EG-NEXT: 5231(7.330192e-42), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -545810305(-1.784115e+19), 2(2.802597e-45) + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = or i64 %loada, 22470723082367 + store i64 %or, ptr addrspace(1) %out + ret void +} + +; FIXME: The or 0 should really be removed. +define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_i64_imm: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = or i64 %loada, 8 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_i64_neg_inline_imm: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -8(nan), 2(2.802597e-45) +; EG-NEXT: MOV * T0.Y, literal.x, +; EG-NEXT: -1(nan), 0(0.000000e+00) + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = or i64 %loada, -8 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_i64_neg_literal: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -200(nan), 2(2.802597e-45) +; EG-NEXT: MOV * T0.Y, literal.x, +; EG-NEXT: -1(nan), 0(0.000000e+00) + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = or i64 %loada, -200 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { +; EG-LABEL: trunc_i64_or_to_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 
+; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.X, KC0[7].Y, KC0[4].W, + %add = or i64 %b, %a + %trunc = trunc i64 %add to i32 + store i32 %trunc, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; EG-LABEL: or_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 4, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV T0.X, KC0[2].Z, +; EG-NEXT: MOV * T1.X, KC0[2].W, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: MAX_DX10 * T0.W, T0.X, T1.X, +; EG-NEXT: SETGE_DX10 * T0.W, PV.W, 0.0, +; EG-NEXT: AND_INT T0.X, PV.W, 1, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load float, ptr addrspace(1) %in0 + %b = load float, ptr addrspace(1) %in1 + %acmp = fcmp oge float %a, 0.000000e+00 + %bcmp = fcmp oge float %b, 0.000000e+00 + %or = or i1 %acmp, %bcmp + %result = zext i1 %or to i32 + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { +; EG-LABEL: s_or_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: SETE_INT T0.W, KC0[3].X, KC0[3].Y, +; EG-NEXT: SETE_INT * T1.W, KC0[2].Z, KC0[2].W, +; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PS, 1, +; EG-NEXT: LSHL * T1.W, PV.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %cmp0 = icmp eq i32 %a, %b + %cmp1 = icmp eq i32 %c, %d + %or = or i1 %cmp0, %cmp1 + store i1 %or, ptr addrspace(1) %out + ret void +} From c5b767c9e98b868ab74df132647e469f1d5be744 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Wed, 25 Jun 2025 09:38:06 -0500 Subject: [PATCH 13/25] Remove unhelpful commentary. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 6a764e9948136..da81e6764e795 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4063,10 +4063,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDLoc SL(N); SelectionDAG &DAG = DCI.DAG; - // When the shl64_reduce optimisation code is passed through vector - // legalization some scalarising occurs. After ISD::AND was legalised, this - // resulted in the AND instructions no longer being elided, as mentioned - // below. The following code should make sure this takes place. 
if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue VAND = RHS.getOperand(0); if (ConstantSDNode *CRRHS = dyn_cast(RHS->getOperand(1))) { @@ -4312,10 +4308,6 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, SDLoc SL(N); unsigned RHSVal; - // When the shl64_reduce optimisation code is passed through vector - // legalization some scalarising occurs. After ISD::AND was legalised, this - // resulted in the AND instructions no longer being elided, as mentioned - // below. The following code should make sure this takes place. if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue VAND = RHS.getOperand(0); if (ConstantSDNode *CRRHS = dyn_cast(RHS->getOperand(1))) { From 3da5a3d065c32aea062b27dab60cd97826b1576d Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Wed, 25 Jun 2025 09:45:13 -0500 Subject: [PATCH 14/25] Remove unnecessary driveby clang-format --- llvm/lib/Target/AMDGPU/SIInstructions.td | 41 +++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 4dc94afbcb7b5..3e3603cc9ef64 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2541,25 +2541,30 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), } // end True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseRealTrue16Insts in { - def : GCNPat<(rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, (EXTRACT_SUBREG $src1, lo16), - /* clamp */ 0, /* op_sel */ 0)>; +def : GCNPat < + (rotr i32:$src0, i32:$src1), + (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0, + /* src1_modifiers */ 0, $src0, + /* src2_modifiers */ 0, + (EXTRACT_SUBREG $src1, lo16), + /* clamp */ 0, /* op_sel */ 0) +>; - def : GCNPat< - (i32(trunc(srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), - (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */ - (i32(EXTRACT_SUBREG(i64 $src0), sub1)), 0, /* src1_modifiers */ - (i32(EXTRACT_SUBREG(i64 $src0), sub0)), 0, /* src2_modifiers */ - (i16(EXTRACT_SUBREG VGPR_32:$src1, lo16)), - /* clamp */ 0, /* op_sel */ 0)>; - - def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), - (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src1, - /* src2_modifiers */ 0, (EXTRACT_SUBREG VGPR_32:$src2, lo16), - /* clamp */ 0, /* op_sel */ 0)>; +def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), + (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */ + (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), + 0, /* src1_modifiers */ + (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), + 0, /* src2_modifiers */ + (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)), + /* clamp */ 0, /* op_sel */ 0)>; + +def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), + (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0, + /* src1_modifiers */ 0, $src1, + /* src2_modifiers */ 0, + (EXTRACT_SUBREG VGPR_32:$src2, lo16), + /* clamp */ 0, /* op_sel */ 0)>; } // end True16Predicate = UseRealTrue16Insts let True16Predicate = UseFakeTrue16Insts in { From 8c91bfff73492c393da22b0f4998b70cb039baab Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Wed, 25 Jun 2025 09:48:58 -0500 Subject: [PATCH 15/25] Remove dead checks in xor.ll --- llvm/test/CodeGen/AMDGPU/xor.ll | 572 -------------------------------- 1 file changed, 572 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index d7e780a5ddf74..feb6ecd996516 100644 --- 
a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -76,36 +76,6 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: v_xor_b32_e32 v1, v1, v3 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm -; SIS-LABEL: xor_v2i32: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 -; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5] -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: v_mov_b32_e32 v1, s5 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: xor_v2i32: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; VIS-NEXT: v_mov_b32_e32 v3, s1 -; VIS-NEXT: v_mov_b32_e32 v2, s0 -; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VIS-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %in0 %b = load <2 x i32>, ptr addrspace(1) %in1 @@ -160,46 +130,6 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: v_xor_b32_e32 v0, v0, v4 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm -; SIS-LABEL: xor_v4i32: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; SIS-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_xor_b32 s7, s7, s11 -; SIS-NEXT: s_xor_b32 s6, s6, s10 -; SIS-NEXT: s_xor_b32 s5, s5, s9 -; SIS-NEXT: s_xor_b32 s4, s4, s8 -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: v_mov_b32_e32 v1, s5 -; SIS-NEXT: v_mov_b32_e32 v2, s6 -; SIS-NEXT: v_mov_b32_e32 v3, s7 -; SIS-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: xor_v4i32: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; VIS-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 -; VIS-NEXT: v_mov_b32_e32 v4, s0 -; VIS-NEXT: v_mov_b32_e32 v5, s1 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b32 s0, s7, s11 -; VIS-NEXT: s_xor_b32 s1, s6, s10 -; VIS-NEXT: s_xor_b32 s2, s5, s9 -; VIS-NEXT: s_xor_b32 s3, s4, s8 -; VIS-NEXT: v_mov_b32_e32 v0, s3 -; VIS-NEXT: v_mov_b32_e32 v1, s2 -; VIS-NEXT: v_mov_b32_e32 v2, s1 -; VIS-NEXT: v_mov_b32_e32 v3, s0 -; VIS-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VIS-NEXT: s_endpgm %a = load <4 x i32>, ptr addrspace(1) %in0 %b = load <4 x i32>, ptr addrspace(1) %in1 %result = xor <4 x i32> %a, %b @@ -255,44 +185,6 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm -; SIS-LABEL: xor_i1: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_load_dwordx2 
s[4:5], s[4:5], 0xd -; SIS-NEXT: s_mov_b32 s7, 0xf000 -; SIS-NEXT: s_mov_b32 s6, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_load_dword s8, s[2:3], 0x0 -; SIS-NEXT: s_load_dword s9, s[4:5], 0x0 -; SIS-NEXT: s_mov_b32 s4, s0 -; SIS-NEXT: s_mov_b32 s5, s1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: v_cmp_ge_f32_e64 s[0:1], s8, 0 -; SIS-NEXT: v_cmp_ge_f32_e64 s[2:3], s9, 1.0 -; SIS-NEXT: v_mov_b32_e32 v0, s9 -; SIS-NEXT: v_mov_b32_e32 v1, s8 -; SIS-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] -; SIS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: xor_i1: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_load_dword s6, s[2:3], 0x0 -; VIS-NEXT: s_load_dword s4, s[4:5], 0x0 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: v_cmp_ge_f32_e64 s[0:1], s6, 0 -; VIS-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, 1.0 -; VIS-NEXT: v_mov_b32_e32 v2, s4 -; VIS-NEXT: v_mov_b32_e32 v3, s6 -; VIS-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] -; VIS-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; VIS-NEXT: flat_store_dword v[0:1], v2 -; VIS-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 @@ -348,48 +240,6 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0 ; VI-NEXT: v_and_b32_e32 v2, 1, v2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm -; SIS-LABEL: v_xor_i1: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SIS-NEXT: s_mov_b32 s7, 0xf000 -; SIS-NEXT: s_mov_b32 s6, -1 -; SIS-NEXT: s_mov_b32 s14, s6 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_mov_b32 s12, s2 -; SIS-NEXT: s_mov_b32 s13, s3 -; SIS-NEXT: s_mov_b32 s15, s7 -; SIS-NEXT: s_mov_b32 s10, s6 -; SIS-NEXT: s_mov_b32 s11, s7 -; SIS-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc -; SIS-NEXT: s_waitcnt vmcnt(0) -; SIS-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc -; SIS-NEXT: s_waitcnt vmcnt(0) -; SIS-NEXT: s_mov_b32 s4, s0 -; SIS-NEXT: s_mov_b32 s5, s1 -; SIS-NEXT: v_xor_b32_e32 v0, v0, v1 -; SIS-NEXT: v_and_b32_e32 v0, 1, v0 -; SIS-NEXT: buffer_store_byte v0, off, s[4:7], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: v_xor_i1: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: v_mov_b32_e32 v0, s2 -; VIS-NEXT: v_mov_b32_e32 v1, s3 -; VIS-NEXT: v_mov_b32_e32 v2, s4 -; VIS-NEXT: v_mov_b32_e32 v3, s5 -; VIS-NEXT: flat_load_ubyte v4, v[0:1] glc -; VIS-NEXT: s_waitcnt vmcnt(0) -; VIS-NEXT: flat_load_ubyte v2, v[2:3] glc -; VIS-NEXT: s_waitcnt vmcnt(0) -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: v_xor_b32_e32 v2, v4, v2 -; VIS-NEXT: v_and_b32_e32 v2, 1, v2 -; VIS-NEXT: flat_store_byte v[0:1], v2 -; VIS-NEXT: s_endpgm %a = load volatile i1, ptr addrspace(1) %in0 %b = load volatile i1, ptr addrspace(1) %in1 %xor = xor i1 %a, %b @@ -437,34 +287,6 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_xor_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm -; SIS-LABEL: vector_xor_i32: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_load_dword s6, s[2:3], 0x0 
-; SIS-NEXT: s_load_dword s4, s[4:5], 0x0 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_xor_b32 s4, s6, s4 -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: vector_xor_i32: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_load_dword s2, s[2:3], 0x0 -; VIS-NEXT: s_load_dword s3, s[4:5], 0x0 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b32 s0, s2, s3 -; VIS-NEXT: v_mov_b32_e32 v2, s0 -; VIS-NEXT: flat_store_dword v[0:1], v2 -; VIS-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in0 %b = load i32, ptr addrspace(1) %in1 %result = xor i32 %a, %b @@ -496,28 +318,6 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm -; SIS-LABEL: scalar_xor_i32: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_mov_b32 s7, 0xf000 -; SIS-NEXT: s_mov_b32 s6, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_mov_b32 s4, s0 -; SIS-NEXT: s_xor_b32 s0, s2, s3 -; SIS-NEXT: s_mov_b32 s5, s1 -; SIS-NEXT: v_mov_b32_e32 v0, s0 -; SIS-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: scalar_xor_i32: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b32 s2, s2, s3 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: v_mov_b32_e32 v2, s2 -; VIS-NEXT: flat_store_dword v[0:1], v2 -; VIS-NEXT: s_endpgm %result = xor i32 %a, %b store i32 %result, ptr addrspace(1) %out ret void @@ -547,28 +347,6 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm -; SIS-LABEL: scalar_not_i32: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dword s6, s[4:5], 0xb -; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_not_b32 s4, s6 -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: scalar_not_i32: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dword s2, s[4:5], 0x2c -; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_not_b32 s2, s2 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: v_mov_b32_e32 v2, s2 -; VIS-NEXT: flat_store_dword v[0:1], v2 -; VIS-NEXT: s_endpgm %result = xor i32 %a, -1 store i32 %result, ptr addrspace(1) %out ret void @@ -606,30 +384,6 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_not_b32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm -; SIS-LABEL: vector_not_i32: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_load_dword s4, s[2:3], 0x0 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_not_b32 s4, s4 -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: vector_not_i32: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_load_dword s2, s[2:3], 0x0 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_not_b32 s0, s2 -; VIS-NEXT: v_mov_b32_e32 v2, s0 -; VIS-NEXT: flat_store_dword v[0:1], v2 -; VIS-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in0 %b = load i32, ptr addrspace(1) %in1 %result = xor i32 %a, -1 @@ -679,36 +433,6 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_xor_b32_e32 v1, v1, v3 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm -; SIS-LABEL: vector_xor_i64: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 -; SIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5] -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: v_mov_b32_e32 v1, s5 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: vector_xor_i64: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; VIS-NEXT: v_mov_b32_e32 v3, s1 -; VIS-NEXT: v_mov_b32_e32 v2, s0 -; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VIS-NEXT: s_endpgm %a = load i64, ptr addrspace(1) %in0 %b = load i64, ptr addrspace(1) %in1 %result = xor i64 %a, %b @@ -744,32 +468,6 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm -; SIS-LABEL: scalar_xor_i64: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SIS-NEXT: s_mov_b32 s7, 0xf000 -; SIS-NEXT: s_mov_b32 s6, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_mov_b32 s4, s0 -; SIS-NEXT: s_mov_b32 s5, s1 -; SIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] -; SIS-NEXT: v_mov_b32_e32 v0, s0 -; SIS-NEXT: v_mov_b32_e32 v1, s1 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: scalar_xor_i64: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; VIS-NEXT: v_mov_b32_e32 v3, s1 -; VIS-NEXT: v_mov_b32_e32 v2, s0 -; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VIS-NEXT: s_endpgm %result = xor i64 %a, %b store i64 %result, ptr addrspace(1) %out ret void @@ -801,30 +499,6 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm -; SIS-LABEL: scalar_not_i64: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_mov_b32 s7, 0xf000 -; SIS-NEXT: s_mov_b32 s6, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_mov_b32 s4, s0 -; SIS-NEXT: s_mov_b32 s5, s1 -; SIS-NEXT: s_not_b64 s[0:1], s[2:3] -; SIS-NEXT: v_mov_b32_e32 
v0, s0 -; SIS-NEXT: v_mov_b32_e32 v1, s1 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: scalar_not_i64: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: s_not_b64 s[0:1], s[2:3] -; VIS-NEXT: v_mov_b32_e32 v3, s1 -; VIS-NEXT: v_mov_b32_e32 v2, s0 -; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VIS-NEXT: s_endpgm %result = xor i64 %a, -1 store i64 %result, ptr addrspace(1) %out ret void @@ -864,32 +538,6 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_not_b32_e32 v1, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; SIS-LABEL: vector_not_i64: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_not_b64 s[4:5], s[4:5] -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: v_mov_b32_e32 v1, s5 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: vector_not_i64: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_not_b64 s[0:1], s[2:3] -; VIS-NEXT: v_mov_b32_e32 v3, s1 -; VIS-NEXT: v_mov_b32_e32 v2, s0 -; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VIS-NEXT: s_endpgm %a = load i64, ptr addrspace(1) %in0 %b = load i64, ptr addrspace(1) %in1 %result = xor i64 %a, -1 @@ -956,57 +604,6 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; VI-NEXT: .LBB14_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_branch .LBB14_2 -; SIS-LABEL: xor_cf: -; SIS: ; %bb.0: ; %entry -; SIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 -; SIS-NEXT: s_mov_b64 s[10:11], 0 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 -; SIS-NEXT: s_and_b64 vcc, exec, s[8:9] -; SIS-NEXT: s_cbranch_vccz .LBB12_4 -; SIS-NEXT: ; %bb.1: ; %else -; SIS-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 -; SIS-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_mov_b64 vcc, vcc -; SIS-NEXT: s_cbranch_vccnz .LBB12_3 -; SIS-NEXT: .LBB12_2: ; %if -; SIS-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] -; SIS-NEXT: .LBB12_3: ; %endif -; SIS-NEXT: v_mov_b32_e32 v0, s8 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: v_mov_b32_e32 v1, s9 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; SIS-NEXT: .LBB12_4: -; SIS-NEXT: ; implicit-def: $sgpr8_sgpr9 -; SIS-NEXT: s_branch .LBB12_2 -; VIS-LABEL: xor_cf: -; VIS: ; %bb.0: ; %entry -; VIS-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; VIS-NEXT: s_mov_b64 s[8:9], 0 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_cmp_lg_u64 s[4:5], 0 -; VIS-NEXT: s_cbranch_scc0 .LBB12_4 -; VIS-NEXT: ; %bb.1: ; %else -; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VIS-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; VIS-NEXT: s_cbranch_vccnz .LBB12_3 -; VIS-NEXT: .LBB12_2: ; %if -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7] -; VIS-NEXT: .LBB12_3: ; %endif -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: v_mov_b32_e32 v2, s2 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; 
VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: v_mov_b32_e32 v3, s3 -; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VIS-NEXT: s_endpgm -; VIS-NEXT: .LBB12_4: -; VIS-NEXT: ; implicit-def: $sgpr2_sgpr3 -; VIS-NEXT: s_branch .LBB12_2 entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else @@ -1053,32 +650,6 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; SIS-LABEL: scalar_xor_literal_i64: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_xor_b32 s4, s7, 0xf237b -; SIS-NEXT: s_xor_b32 s5, s6, 0x3039 -; SIS-NEXT: v_mov_b32_e32 v0, s5 -; SIS-NEXT: v_mov_b32_e32 v1, s4 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: scalar_xor_literal_i64: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c -; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b32 s1, s1, 0xf237b -; VIS-NEXT: s_xor_b32 s0, s0, 0x3039 -; VIS-NEXT: v_mov_b32_e32 v2, s2 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: v_mov_b32_e32 v3, s3 -; VIS-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VIS-NEXT: s_endpgm %or = xor i64 %a, 4261135838621753 store i64 %or, ptr addrspace(1) %out ret void @@ -1127,47 +698,6 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm -; SIS-LABEL: scalar_xor_literal_multi_use_i64: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SIS-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x13 -; SIS-NEXT: s_movk_i32 s8, 0x3039 -; SIS-NEXT: s_mov_b32 s9, 0xf237b -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: v_mov_b32_e32 v1, s5 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SIS-NEXT: s_add_u32 s0, s6, 0x3039 -; SIS-NEXT: s_addc_u32 s1, s7, 0xf237b -; SIS-NEXT: s_waitcnt expcnt(0) -; SIS-NEXT: v_mov_b32_e32 v0, s0 -; SIS-NEXT: v_mov_b32_e32 v1, s1 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SIS-NEXT: s_waitcnt vmcnt(0) -; SIS-NEXT: s_endpgm -; VIS-LABEL: scalar_xor_literal_multi_use_i64: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4c -; VIS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 -; VIS-NEXT: s_movk_i32 s6, 0x3039 -; VIS-NEXT: s_mov_b32 s7, 0xf237b -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; VIS-NEXT: v_mov_b32_e32 v0, s4 -; VIS-NEXT: v_mov_b32_e32 v3, s1 -; VIS-NEXT: v_mov_b32_e32 v1, s5 -; VIS-NEXT: v_mov_b32_e32 v2, s0 -; VIS-NEXT: s_add_u32 s0, s2, 0x3039 -; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VIS-NEXT: s_addc_u32 s1, s3, 0xf237b -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: flat_store_dwordx2 v[0:1], v[0:1] -; VIS-NEXT: s_waitcnt vmcnt(0) -; VIS-NEXT: s_endpgm %or = xor i64 %a, 4261135838621753 store i64 %or, ptr addrspace(1) %out @@ -1202,30 +732,6 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; SIS-LABEL: 
scalar_xor_inline_imm_i64: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_xor_b32 s4, s6, 63 -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: v_mov_b32_e32 v1, s7 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: scalar_xor_inline_imm_i64: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c -; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b32 s0, s0, 63 -; VIS-NEXT: v_mov_b32_e32 v2, s2 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v3, s3 -; VIS-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VIS-NEXT: s_endpgm %or = xor i64 %a, 63 store i64 %or, ptr addrspace(1) %out ret void @@ -1257,30 +763,6 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm -; SIS-LABEL: scalar_xor_neg_inline_imm_i64: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; SIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_xor_b64 s[4:5], s[6:7], -8 -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: v_mov_b32_e32 v1, s5 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: scalar_xor_neg_inline_imm_i64: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c -; VIS-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b64 s[0:1], s[0:1], -8 -; VIS-NEXT: v_mov_b32_e32 v0, s2 -; VIS-NEXT: v_mov_b32_e32 v3, s1 -; VIS-NEXT: v_mov_b32_e32 v1, s3 -; VIS-NEXT: v_mov_b32_e32 v2, s0 -; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VIS-NEXT: s_endpgm %or = xor i64 %a, -8 store i64 %or, ptr addrspace(1) %out @@ -1321,32 +803,6 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ; VI-NEXT: v_xor_b32_e32 v1, -1, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; SIS-LABEL: vector_xor_i64_neg_inline_imm: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_xor_b64 s[4:5], s[4:5], -8 -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: v_mov_b32_e32 v1, s5 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: vector_xor_i64_neg_inline_imm: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b64 s[0:1], s[2:3], -8 -; VIS-NEXT: v_mov_b32_e32 v3, s1 -; VIS-NEXT: v_mov_b32_e32 v2, s0 -; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VIS-NEXT: s_endpgm %loada = load i64, ptr addrspace(1) %a, align 8 %or = xor i64 %loada, -8 store i64 %or, ptr addrspace(1) %out @@ -1387,34 +843,6 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; SIS-LABEL: 
vector_xor_literal_i64: -; SIS: ; %bb.0: -; SIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SIS-NEXT: s_mov_b32 s3, 0xf000 -; SIS-NEXT: s_mov_b32 s2, -1 -; SIS-NEXT: s_waitcnt lgkmcnt(0) -; SIS-NEXT: s_xor_b32 s5, s5, 0x146f -; SIS-NEXT: s_xor_b32 s4, s4, 0xdf77987f -; SIS-NEXT: v_mov_b32_e32 v0, s4 -; SIS-NEXT: v_mov_b32_e32 v1, s5 -; SIS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SIS-NEXT: s_endpgm -; VIS-LABEL: vector_xor_literal_i64: -; VIS: ; %bb.0: -; VIS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VIS-NEXT: v_mov_b32_e32 v0, s0 -; VIS-NEXT: v_mov_b32_e32 v1, s1 -; VIS-NEXT: s_waitcnt lgkmcnt(0) -; VIS-NEXT: s_xor_b32 s0, s3, 0x146f -; VIS-NEXT: s_xor_b32 s1, s2, 0xdf77987f -; VIS-NEXT: v_mov_b32_e32 v2, s1 -; VIS-NEXT: v_mov_b32_e32 v3, s0 -; VIS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VIS-NEXT: s_endpgm %loada = load i64, ptr addrspace(1) %a, align 8 %or = xor i64 %loada, 22470723082367 From d60d0119487e7eafe1625940eb9ea04f7fd92bd8 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Wed, 25 Jun 2025 10:25:34 -0500 Subject: [PATCH 16/25] Remove unnnecessary node duplication --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index da81e6764e795..718c2bd208264 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4093,9 +4093,10 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, LHSAND, Zero); SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One); - SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32); - SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask); - SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask); + SDValue LoAnd = + DAG.getNode(ISD::AND, SL, MVT::i32, Lo, RHSAND->getOperand(0)); + SDValue HiAnd = + DAG.getNode(ISD::AND, SL, MVT::i32, Hi, RHSAND->getOperand(0)); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); if (AndIndex == 0 || AndIndex == 1) return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc, From 1d3f7544b6ce3ce8264196fa5c4193132a8bdf17 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Thu, 26 Jun 2025 09:23:24 -0500 Subject: [PATCH 17/25] Modify allUsesHaveSourceMods() instead of foldFreeOpFromSelect() This prevents any regressions in feng-modifier-casting.ll. 
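
As a rough illustration of the shape being protected (a hand-written sketch,
not one of the lit tests; the function and value names below are invented and
the DAG may look different after legalisation), consider a v2f32 build_vector
whose lanes are selects of negated values:

  define <2 x float> @sketch(i1 %c0, i1 %c1, float %a, float %b) {
    %neg.a = fneg float %a
    %neg.b = fneg float %b
    %lane0 = select i1 %c0, float %neg.a, float %b
    %lane1 = select i1 %c1, float %neg.b, float %a
    %v0 = insertelement <2 x float> poison, float %lane0, i32 0
    %v1 = insertelement <2 x float> %v0, float %lane1, i32 1
    ret <2 x float> %v1
  }

By reporting that such a build_vector does not support source modifiers,
allUsesHaveSourceMods() stops foldFreeOpFromSelect() from hoisting the fneg
above the selects; the intent is that instruction selection can still absorb
the negation as a neg source modifier on each v_cndmask_b32 rather than
emitting separate v_xor_b32 instructions.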
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 33 +++++++++---------- .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 16 ++++----- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 718c2bd208264..3a24a8efaf2c8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -725,6 +725,19 @@ static bool selectSupportsSourceMods(const SDNode *N) { return N->getValueType(0) == MVT::f32; } +LLVM_READONLY +static bool buildVectorSupportsSourceMods(const SDNode *N) { + if (N->getValueType(0) != MVT::v2f32) + return true; + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if (LHS->getOpcode() != ISD::SELECT || RHS->getOpcode() != ISD::SELECT) + return true; + + return false; +} + // Most FP instructions support source modifiers, but this could be refined // slightly. LLVM_READONLY @@ -758,6 +771,8 @@ static bool hasSourceMods(const SDNode *N) { return true; } } + case ISD::BUILD_VECTOR: + return buildVectorSupportsSourceMods(N); case ISD::SELECT: return selectSupportsSourceMods(N); default: @@ -4865,24 +4880,6 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) return SDValue(); - // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be - // lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled - // out in this case. For now I've made the logic as specific to the case as - // possible, hopefully this can be relaxed in future. - if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) { - SDValue LHSB = LHS.getOperand(0); - SDValue RHSB = RHS.getOperand(0); - if (LHSB.getOpcode() == ISD::BITCAST && - RHSB->getOpcode() == ISD::BITCAST) { - EVT LHSBOpTy = LHSB->getOperand(0).getValueType(); - EVT RHSBOpTy = RHSB->getOperand(0).getValueType(); - if (LHSB.getValueType() == MVT::f32 && - RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 && - RHSBOpTy == MVT::i32) - return SDValue(); - } - } - return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS, RHS); } diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 94f41097b7aa1..5674ae328406d 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1634,12 +1634,12 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_cselect_b32 s1, s1, s3 -; GFX7-NEXT: v_cndmask_b32_e64 v0, -v0, -v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: s_cselect_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1658,10 +1658,10 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_cselect_b32 s1, s1, s3 -; GFX9-NEXT: v_cndmask_b32_e64 v0, -v0, -v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: s_cselect_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -1672,17 +1672,17 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x18 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-NEXT: s_bitcmp1_b32 s6, 0 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, -s3, -v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo ; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s1, s1, s3 ; GFX11-NEXT: s_cselect_b32 s0, s0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, s1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, -v0, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm From 0ada80b20ab82db1e9aeda33247c649133a3c774 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Thu, 26 Jun 2025 10:23:56 -0500 Subject: [PATCH 18/25] Remove single-use variables from buildVectorSupportsSourceMods() --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3a24a8efaf2c8..bab86dc225cc7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -730,9 +730,7 @@ static bool buildVectorSupportsSourceMods(const SDNode *N) { if (N->getValueType(0) != MVT::v2f32) return true; - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if (LHS->getOpcode() != ISD::SELECT || RHS->getOpcode() != ISD::SELECT) + if (N->getOperand(0) != ISD::SELECT || N->getOperand(1) != ISD::SELECT) return true; return false; From 5a97e1c6d8e1cecf7d166ebbeeb31acafd69501e Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Thu, 26 Jun 2025 11:32:18 -0500 Subject: [PATCH 19/25] Correct failure to call getOpcode() --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index bab86dc225cc7..8055b44b804da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -730,7 +730,8 @@ static bool buildVectorSupportsSourceMods(const SDNode *N) { if (N->getValueType(0) != MVT::v2f32) return true; - if (N->getOperand(0) != ISD::SELECT || N->getOperand(1) != ISD::SELECT) + if (N->getOperand(0)->getOpcode() != ISD::SELECT || + N->getOperand(1)->getOpcode() != ISD::SELECT) return true; return false; From 573adfe87f3302d5a7a488e8053b5602808d8c15 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Fri, 18 Jul 2025 07:23:33 -0500 Subject: [PATCH 20/25] Work to fix regressions in integer select srcmod generation when v2i32 is made legal for or/xor/and. Complete fix of v2i32 in VOP SrcMod placement. 
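
A reduced form of the pattern this is chasing (illustrative only; the
checked-in coverage is in integer-select-src-modifiers.ll, and the names below
are invented) is an integer fneg of a v2i32 select, written as an xor with a
sign-mask splat:

  define <2 x i32> @sketch(<2 x i1> %c, <2 x i32> %x, <2 x i32> %y) {
    %sel = select <2 x i1> %c, <2 x i32> %x, <2 x i32> %y
    %neg = xor <2 x i32> %sel, splat (i32 -2147483648)
    ret <2 x i32> %neg
  }

The xor combine pushes the sign-mask xor into the vselect operands, and the
EXTRACT_VECTOR_ELT peek-through in SelectVOP3ModsImpl() then lets the
scalarised selects pick up the flipped sign bit as a neg source modifier on
each v_cndmask_b32, instead of leaving a separate per-lane v_xor_b32.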
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 36 ++- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 32 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 296 ++++++++++-------- llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll | 32 +- llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 4 +- .../AMDGPU/integer-select-src-modifiers.ll | 34 +- 6 files changed, 255 insertions(+), 179 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index fe0e7eb279486..97fa9fd7742c4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3059,36 +3059,62 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, Src = Src.getOperand(0); } + // v2i32 xor/or/and are legal. A vselect using these instructions as operands + // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek + // through this extract if possible. + auto getVectorBitWiseOp = [](SDValue S) -> SDValue { + if (S->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue VecOp = S->getOperand(0); + if (VecOp.getOpcode() == ISD::XOR || VecOp.getOpcode() == ISD::AND || + VecOp.getOpcode() == ISD::OR) + return VecOp; + } + return SDValue(); + }; + + SDValue Vec = getVectorBitWiseOp(Src); + SDValue BWSrc = Vec ? Vec : Src; // Convert various sign-bit masks to src mods. Currently disabled for 16-bit // types as the codegen replaces the operand without adding a srcmod. // This is intentionally finding the cases where we are performing float neg // and abs on int types, the goal is not to obtain two's complement neg or // abs. // TODO: Add 16-bit support. - unsigned Opc = Src->getOpcode(); + unsigned Opc = Vec ? Vec->getOpcode() : Src->getOpcode(); EVT VT = Src.getValueType(); if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) || (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64)) return true; - ConstantSDNode *CRHS = dyn_cast(Src->getOperand(1)); + ConstantSDNode *CRHS = + isConstOrConstSplat(Vec ? Vec->getOperand(1) : Src->getOperand(1)); if (!CRHS) return true; + auto ReplaceSrc = [&]() -> SDValue { + if (Vec) { + SDValue LHS = BWSrc->getOperand(0); + SDValue Index = Src->getOperand(1); + return Src = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src), + Src.getValueType(), LHS, Index); + } + return Src = BWSrc.getOperand(0); + }; + // Recognise (xor a, 0x80000000) as NEG SrcMod. // Recognise (and a, 0x7fffffff) as ABS SrcMod. // Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers. 
if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) { Mods |= SISrcMods::NEG; - Src = Src.getOperand(0); + Src = ReplaceSrc(); } else if (Opc == ISD::AND && AllowAbs && CRHS->getAPIntValue().isMaxSignedValue()) { Mods |= SISrcMods::ABS; - Src = Src.getOperand(0); + Src = ReplaceSrc(); } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) { Mods |= SISrcMods::ABS; Mods |= SISrcMods::NEG; - Src = Src.getOperand(0); + Src = ReplaceSrc(); } return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 8055b44b804da..dbd9bae93502b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4251,12 +4251,12 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, (ElementType.getSizeInBits() - 1)) { ShiftAmt = ShiftFullAmt; } else { - SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS); + SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS); const SDValue ShiftMask = DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType); // This AND instruction will clamp out of bounds shift values. // It will also be removed during later instruction selection. - ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask); + ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask); } EVT ConcatType; @@ -4313,16 +4313,8 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, SL, VT, Vec); } -SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SDValue RHS = N->getOperand(1); - ConstantSDNode *CRHS = dyn_cast(RHS); - EVT VT = N->getValueType(0); - SDValue LHS = N->getOperand(0); - SelectionDAG &DAG = DCI.DAG; - SDLoc SL(N); - unsigned RHSVal; - +static SDValue getScalarisedShift(SDValue LHS, SDValue RHS, SelectionDAG &DAG) { + SDLoc SL = SDLoc(RHS); if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue VAND = RHS.getOperand(0); if (ConstantSDNode *CRRHS = dyn_cast(RHS->getOperand(1))) { @@ -4359,12 +4351,26 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); if (AndIndex == 0 || AndIndex == 1) return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc, - AndIndex == 0 ? LoAnd : HiAnd, N->getFlags()); + AndIndex == 0 ? LoAnd : HiAnd, RHS->getFlags()); } } } } } + return SDValue(); +} + +SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue RHS = N->getOperand(1); + ConstantSDNode *CRHS = dyn_cast(RHS); + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + unsigned RHSVal; + + if (CRHS) { RHSVal = CRHS->getZExtValue(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 7b7a637046081..a61a7aad82e29 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13366,36 +13366,7 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - // Fold the fneg of a vselect into the v2 vselect operands. 
- // xor (vselect c, a, b), 0x80000000 -> - // bitcast (vselect c, (fneg (bitcast a)), (fneg (bitcast b))) - if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) { - - const ConstantSDNode *CRHS0 = dyn_cast(RHS.getOperand(0)); - const ConstantSDNode *CRHS1 = dyn_cast(RHS.getOperand(1)); - SDValue LHS_0 = LHS.getOperand(0); - SDValue LHS_1 = LHS.getOperand(1); - - if (LHS.getOpcode() == ISD::VSELECT && CRHS0 && - CRHS0->getAPIntValue().isSignMask() && - shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 && - CRHS1->getAPIntValue().isSignMask() && - shouldFoldFNegIntoSrc(N, LHS_1)) { - - SDLoc DL(N); - SDValue CastLHS = - DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1)); - SDValue CastRHS = - DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2)); - SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS); - SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS); - SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32, - LHS->getOperand(0), FNegLHS, FNegRHS); - return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect); - } - } - - const ConstantSDNode *CRHS = dyn_cast(RHS); + const ConstantSDNode *CRHS = isConstOrConstSplat(RHS); if (CRHS && VT == MVT::i64) { if (SDValue Split = @@ -13403,6 +13374,23 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, return Split; } + // v2i32 (xor (vselect cc, x, y), K) -> + // (v2i32 svelect cc, (xor x, K), (xor y, K)) This enables the xor to be + // replaced with source modifiers when the select is lowered to CNDMASK. + // TODO REMOVE: prevents regressions in fneg-modifier-casting.ll + unsigned Opc = LHS.getOpcode(); + if(((Opc == ISD::VSELECT && VT==MVT::v2i32) || (Opc == ISD::SELECT && VT==MVT::i64)) && CRHS && CRHS->getAPIntValue().isSignMask()) { + SDValue CC = LHS->getOperand(0); + SDValue TRUE = LHS->getOperand(1); + SDValue FALSE = LHS->getOperand(2); + SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS); + SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS); + SDValue XSelect = DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse); + return XSelect; + } + + + // Make sure to apply the 64-bit constant splitting fold before trying to fold // fneg-like xors into 64-bit select. 
if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) { @@ -14367,125 +14355,165 @@ bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const { EltSize, NumElem, Idx->isDivergent(), getSubtarget()); } -SDValue -SITargetLowering::performExtractVectorEltCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SDValue Vec = N->getOperand(0); - SelectionDAG &DAG = DCI.DAG; +// SDValue +// SITargetLowering::performBuildVectorCombine(SDNode *N, +// DAGCombinerInfo &DCI) const { +// // if (N->use_empty()) +// // return SDValue(); - EVT VecVT = Vec.getValueType(); - EVT VecEltVT = VecVT.getVectorElementType(); - EVT ResVT = N->getValueType(0); +// // if(!N->getValueType(0).isFloatingPoint()) +// // return SDValue(); - unsigned VecSize = VecVT.getSizeInBits(); - unsigned VecEltSize = VecEltVT.getSizeInBits(); +// // SelectionDAG &DAG = DCI.DAG; - if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) && - allUsesHaveSourceMods(N)) { - SDLoc SL(N); - SDValue Idx = N->getOperand(1); - SDValue Elt = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx); - return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt); - } - - // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) - // => - // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) - // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) - // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt - if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) { - SDLoc SL(N); - SDValue Idx = N->getOperand(1); - unsigned Opc = Vec.getOpcode(); +// // // Iterate the operands. Check if source modifier. If so, propogate the +// // source +// // // modifier to the user and the srcmod from the BUILD_VECTOR element. +// // for (unsigned I = 0; I < N->getNumOperands(); I++) { +// // SDValue E = N->getOperand(I); +// // if (E->getOpcode() != ISD::FNEG && E->getOpcode() != ISD::ABS) +// // continue; - switch (Opc) { - default: - break; - // TODO: Support other binary operations. - case ISD::FADD: - case ISD::FSUB: - case ISD::FMUL: - case ISD::ADD: - case ISD::UMIN: - case ISD::UMAX: - case ISD::SMIN: - case ISD::SMAX: - case ISD::FMAXNUM: - case ISD::FMINNUM: - case ISD::FMAXNUM_IEEE: - case ISD::FMINNUM_IEEE: - case ISD::FMAXIMUM: - case ISD::FMINIMUM: { - SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, - Vec.getOperand(0), Idx); - SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, - Vec.getOperand(1), Idx); - - DCI.AddToWorklist(Elt0.getNode()); - DCI.AddToWorklist(Elt1.getNode()); - return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags()); - } - } - } - - // EXTRACT_VECTOR_ELT (, var-idx) => n x select (e, const-idx) - if (shouldExpandVectorDynExt(N)) { - SDLoc SL(N); - SDValue Idx = N->getOperand(1); - SDValue V; - for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { - SDValue IC = DAG.getVectorIdxConstant(I, SL); - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC); - if (I == 0) - V = Elt; - else - V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ); +// // // Users through which we can propogate will include users of +// // // extract_element on this vector, so need to peek-through. +// // } + +// // SmallVector UsersToModify; + +// // // If the use of the BUILD_VECTOR supports source mods it can be +// // propogated. 
for (SDNode *U : N->users()) { +// // if(!U->getOpcode() == ISD::EXTRACT_VECTOR_ELT) +// // if (!allUsesHaveSourceMods(U)) +// // continue; +// // UsersToModify.push_back(U); +// // } + +// // for(auto Node: UsersToModify) { + +// // } + +// return SDValue(); +// } + + SDValue SITargetLowering::performExtractVectorEltCombine( + SDNode * N, DAGCombinerInfo & DCI) const { + SDValue Vec = N->getOperand(0); + SelectionDAG &DAG = DCI.DAG; + + EVT VecVT = Vec.getValueType(); + EVT VecEltVT = VecVT.getVectorElementType(); + EVT ResVT = N->getValueType(0); + + unsigned VecSize = VecVT.getSizeInBits(); + unsigned VecEltSize = VecEltVT.getSizeInBits(); + + if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) && + allUsesHaveSourceMods(N)) { + SDLoc SL(N); + SDValue Idx = N->getOperand(1); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, + Vec.getOperand(0), Idx); + return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt); + } + + // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) + // => + // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) + // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) + // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt + if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) { + SDLoc SL(N); + SDValue Idx = N->getOperand(1); + unsigned Opc = Vec.getOpcode(); + + switch (Opc) { + default: + break; + // TODO: Support other binary operations. + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::ADD: + case ISD::UMIN: + case ISD::UMAX: + case ISD::SMIN: + case ISD::SMAX: + case ISD::FMAXNUM: + case ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: + case ISD::FMAXIMUM: + case ISD::FMINIMUM: { + SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, + Vec.getOperand(0), Idx); + SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, + Vec.getOperand(1), Idx); + + DCI.AddToWorklist(Elt0.getNode()); + DCI.AddToWorklist(Elt1.getNode()); + return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags()); + } + } } - return V; - } - if (!DCI.isBeforeLegalize()) - return SDValue(); + // EXTRACT_VECTOR_ELT (, var-idx) => n x select (e, const-idx) + if (shouldExpandVectorDynExt(N)) { + SDLoc SL(N); + SDValue Idx = N->getOperand(1); + SDValue V; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { + SDValue IC = DAG.getVectorIdxConstant(I, SL); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC); + if (I == 0) + V = Elt; + else + V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ); + } + return V; + } - // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit - // elements. This exposes more load reduction opportunities by replacing - // multiple small extract_vector_elements with a single 32-bit extract. - auto *Idx = dyn_cast(N->getOperand(1)); - if (isa(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() && - VecSize > 32 && VecSize % 32 == 0 && Idx) { - EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); - - unsigned BitIndex = Idx->getZExtValue() * VecEltSize; - unsigned EltIdx = BitIndex / 32; - unsigned LeftoverBitIdx = BitIndex % 32; - SDLoc SL(N); + if (!DCI.isBeforeLegalize()) + return SDValue(); - SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec); - DCI.AddToWorklist(Cast.getNode()); + // Try to turn sub-dword accesses of vectors into accesses of the same + // 32-bit elements. This exposes more load reduction opportunities by + // replacing multiple small extract_vector_elements with a single 32-bit + // extract. 
+ auto *Idx = dyn_cast(N->getOperand(1)); + if (isa(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() && + VecSize > 32 && VecSize % 32 == 0 && Idx) { + EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); + + unsigned BitIndex = Idx->getZExtValue() * VecEltSize; + unsigned EltIdx = BitIndex / 32; + unsigned LeftoverBitIdx = BitIndex % 32; + SDLoc SL(N); - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, - DAG.getConstant(EltIdx, SL, MVT::i32)); - DCI.AddToWorklist(Elt.getNode()); - SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, - DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); - DCI.AddToWorklist(Srl.getNode()); + SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec); + DCI.AddToWorklist(Cast.getNode()); - EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger(); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl); - DCI.AddToWorklist(Trunc.getNode()); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, + DAG.getConstant(EltIdx, SL, MVT::i32)); + DCI.AddToWorklist(Elt.getNode()); + SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, + DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); + DCI.AddToWorklist(Srl.getNode()); - if (VecEltVT == ResVT) { - return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc); + EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger(); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl); + DCI.AddToWorklist(Trunc.getNode()); + + if (VecEltVT == ResVT) { + return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc); + } + + assert(ResVT.isScalarInteger()); + return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT); } - assert(ResVT.isScalarInteger()); - return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT); + return SDValue(); } - return SDValue(); -} - SDValue SITargetLowering::performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const { diff --git a/llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll b/llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll index 78942bfc68d63..a06b724e1529c 100644 --- a/llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll @@ -112,8 +112,10 @@ define <2 x i64> @ashr_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) { ; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v0, v6, v1 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, v8, v3 +; CHECK-NEXT: v_and_b32_e32 v2, 31, v8 +; CHECK-NEXT: v_and_b32_e32 v0, 31, v6 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3 ; CHECK-NEXT: v_mov_b32_e32 v1, v5 ; CHECK-NEXT: v_mov_b32_e32 v3, v4 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -145,8 +147,10 @@ define <2 x i64> @ashr_exact_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) { ; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v0, v6, v1 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, v8, v3 +; CHECK-NEXT: v_and_b32_e32 v2, 31, v8 +; CHECK-NEXT: v_and_b32_e32 v0, 31, v6 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3 ; CHECK-NEXT: v_mov_b32_e32 v1, v5 ; CHECK-NEXT: v_mov_b32_e32 v3, v4 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -390,9 +394,11 @@ define <2 x i64> @ashr_v2_or32(<2 x i64> %arg0, <2 x i64> %shift_amt) { ; CHECK-LABEL: ashr_v2_or32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v1 +; CHECK-NEXT: v_and_b32_e32 v2, 31, v6 +; 
CHECK-NEXT: v_and_b32_e32 v0, 31, v4 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3 ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, v6, v3 ; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v3 ; CHECK-NEXT: s_setpc_b64 s[30:31] %or = or <2 x i64> %shift_amt, splat (i64 32) @@ -465,13 +471,17 @@ define <2 x i64> @ashr_v2_or32_sgpr(<2 x i64> inreg %arg0, <2 x i64> inreg %shif ; CHECK-LABEL: ashr_v2_or32_sgpr: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s4, s17, s20 -; CHECK-NEXT: s_ashr_i32 s5, s17, 31 -; CHECK-NEXT: s_ashr_i32 s6, s19, s22 +; CHECK-NEXT: s_mov_b32 s4, 31 +; CHECK-NEXT: s_mov_b32 s21, s22 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: s_and_b64 s[4:5], s[20:21], s[4:5] +; CHECK-NEXT: s_ashr_i32 s6, s17, 31 ; CHECK-NEXT: s_ashr_i32 s7, s19, 31 +; CHECK-NEXT: s_ashr_i32 s4, s17, s4 +; CHECK-NEXT: s_ashr_i32 s5, s19, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: v_mov_b32_e32 v2, s5 ; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: s_setpc_b64 s[30:31] %or = or <2 x i64> %shift_amt, splat (i64 32) diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index ca8f7736f6093..8c1b2d29a541f 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -159,7 +159,7 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; GFX-950-NEXT: v_cvt_f32_f64_e32 v7, v[0:1] ; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3] ; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 ; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7 @@ -168,7 +168,7 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[0:1], v[2:3] ; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] ; GFX-950-NEXT: v_add_u32_e32 v0, v7, v0 -; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4 ; GFX-950-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll index b3c7ac80dd014..23ebfb817096f 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll @@ -341,21 +341,24 @@ define <2 x i32> @s_fneg_select_v2i32_1(<2 x i32> inreg %cond, <2 x i32> inreg % ; GCN-LABEL: s_fneg_select_v2i32_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s4, s19, 0x80000000 -; GCN-NEXT: s_xor_b32 s5, s18, 0x80000000 +; GCN-NEXT: s_brev_b32 s4, 1 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_xor_b64 s[4:5], s[18:19], s[4:5] ; GCN-NEXT: s_cmp_eq_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s5, s5, s20 +; GCN-NEXT: s_cselect_b32 s4, s4, s20 ; GCN-NEXT: s_cmp_eq_u32 s17, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, s21 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_cselect_b32 s5, s5, s21 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: s_fneg_select_v2i32_1: ; 
GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_brev_b32 s4, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, s2, s16 ; GFX11-NEXT: s_cmp_eq_u32 s1, 0 @@ -373,12 +376,13 @@ define <2 x i32> @s_fneg_fabs_select_v2i32_2(<2 x i32> inreg %cond, <2 x i32> in ; GCN-LABEL: s_fneg_fabs_select_v2i32_2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_bitset1_b32 s19, 31 -; GCN-NEXT: s_bitset1_b32 s18, 31 +; GCN-NEXT: s_brev_b32 s4, 1 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_or_b64 s[4:5], s[18:19], s[4:5] ; GCN-NEXT: s_cmp_eq_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s4, s20, s18 +; GCN-NEXT: s_cselect_b32 s4, s20, s4 ; GCN-NEXT: s_cmp_eq_u32 s17, 0 -; GCN-NEXT: s_cselect_b32 s5, s21, s19 +; GCN-NEXT: s_cselect_b32 s5, s21, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -386,8 +390,10 @@ define <2 x i32> @s_fneg_fabs_select_v2i32_2(<2 x i32> inreg %cond, <2 x i32> in ; GFX11-LABEL: s_fneg_fabs_select_v2i32_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s3, 31 -; GFX11-NEXT: s_bitset1_b32 s2, 31 +; GFX11-NEXT: s_brev_b32 s4, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, s16, s2 ; GFX11-NEXT: s_cmp_eq_u32 s1, 0 From 46786e7fc361349755081d7f2ca1ec49e0f1805a Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 21 Jul 2025 09:06:41 -0500 Subject: [PATCH 21/25] Fix 64-bit ashr scalarisation of and for fold int 32-bit shift Factor shift reducing combine logic into one function as it was applied in all three shift combine functions. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 123 +++------ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 250 ++++++++---------- llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll | 32 +-- 3 files changed, 155 insertions(+), 250 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index dbd9bae93502b..bc609032c5e1b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -725,18 +725,6 @@ static bool selectSupportsSourceMods(const SDNode *N) { return N->getValueType(0) == MVT::f32; } -LLVM_READONLY -static bool buildVectorSupportsSourceMods(const SDNode *N) { - if (N->getValueType(0) != MVT::v2f32) - return true; - - if (N->getOperand(0)->getOpcode() != ISD::SELECT || - N->getOperand(1)->getOpcode() != ISD::SELECT) - return true; - - return false; -} - // Most FP instructions support source modifiers, but this could be refined // slightly. 
LLVM_READONLY @@ -770,8 +758,6 @@ static bool hasSourceMods(const SDNode *N) { return true; } } - case ISD::BUILD_VECTOR: - return buildVectorSupportsSourceMods(N); case ISD::SELECT: return selectSupportsSourceMods(N); default: @@ -4068,15 +4054,21 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); } -SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - EVT VT = N->getValueType(0); - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - ConstantSDNode *CRHS = dyn_cast(RHS); - SDLoc SL(N); - SelectionDAG &DAG = DCI.DAG; +// Part of the shift combines is to optimise for the case where its possible +// to reduce e.g shl64 to shl32 if shift range is [63-32]. This +// transforms: DST = shl i64 X, Y to [0, srl i32 X, (Y & 31) ]. The +// '&' is then elided by ISel. The vector code for this was being +// completely scalarised by the vector legalizer, but when v2i32 is +// legal the vector legaliser only partially scalarises the +// vector operations and the and is not elided. This function +// scalarises the AND for this optimisation case. +static SDValue getShiftForReduction(unsigned ShiftOpc, SDValue LHS, SDValue RHS, + SelectionDAG &DAG) { + assert( + (ShiftOpc == ISD::SRA || ShiftOpc == ISD::SRL || ShiftOpc == ISD::SHL) && + "Expected shift Opcode."); + SDLoc SL = SDLoc(RHS); if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue VAND = RHS.getOperand(0); if (ConstantSDNode *CRRHS = dyn_cast(RHS->getOperand(1))) { @@ -4085,15 +4077,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDValue LHSAND = VAND.getOperand(0); SDValue RHSAND = VAND.getOperand(1); if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) { - // Part of shlcombine is to optimise for the case where its possible - // to reduce shl64 to shl32 if shift range is [63-32]. This - // transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31) ]. The - // '&' is then elided by ISel. The vector code for this was being - // completely scalarised by the vector legalizer, but now v2i32 is - // made legal the vector legaliser only partially scalarises the - // vector operations and the and was not elided. This check enables us - // to locate and scalarise the v2i32 and and re-enable ISel to elide - // the and instruction. ConstantSDNode *CANDL = dyn_cast(RHSAND->getOperand(0)); ConstantSDNode *CANDR = @@ -4107,19 +4090,33 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, LHSAND, Zero); SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One); - SDValue LoAnd = - DAG.getNode(ISD::AND, SL, MVT::i32, Lo, RHSAND->getOperand(0)); - SDValue HiAnd = - DAG.getNode(ISD::AND, SL, MVT::i32, Hi, RHSAND->getOperand(0)); + SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32); + SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask); + SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); if (AndIndex == 0 || AndIndex == 1) - return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc, - AndIndex == 0 ? LoAnd : HiAnd, N->getFlags()); + return DAG.getNode(ShiftOpc, SL, MVT::i32, Trunc, + AndIndex == 0 ? 
LoAnd : HiAnd, + RHS->getFlags()); } } } } } + return SDValue(); +} + +SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + ConstantSDNode *CRHS = dyn_cast(RHS); + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + + if (SDValue SS = getShiftForReduction(ISD::SHL, LHS, RHS, DAG)) + return SS; unsigned RHSVal; if (CRHS) { @@ -4221,6 +4218,9 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); + if (SDValue SS = getShiftForReduction(ISD::SRA, LHS, RHS, DAG)) + return SS; + if (VT.getScalarType() != MVT::i64) return SDValue(); @@ -4313,52 +4313,6 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, SL, VT, Vec); } -static SDValue getScalarisedShift(SDValue LHS, SDValue RHS, SelectionDAG &DAG) { - SDLoc SL = SDLoc(RHS); - if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - SDValue VAND = RHS.getOperand(0); - if (ConstantSDNode *CRRHS = dyn_cast(RHS->getOperand(1))) { - uint64_t AndIndex = RHS->getConstantOperandVal(1); - if (VAND->getOpcode() == ISD::AND && CRRHS) { - SDValue LHSAND = VAND.getOperand(0); - SDValue RHSAND = VAND.getOperand(1); - if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) { - // Part of srlcombine is to optimise for the case where its possible - // to reduce shl64 to shl32 if shift range is [63-32]. This - // transforms: DST = shl i64 X, Y to [0, srl i32 X, (Y & 31) ]. The - // '&' is then elided by ISel. The vector code for this was being - // completely scalarised by the vector legalizer, but now v2i32 is - // made legal the vector legaliser only partially scalarises the - // vector operations and the and was not elided. This check enables us - // to locate and scalarise the v2i32 and and re-enable ISel to elide - // the and instruction. - ConstantSDNode *CANDL = - dyn_cast(RHSAND->getOperand(0)); - ConstantSDNode *CANDR = - dyn_cast(RHSAND->getOperand(1)); - if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f && - RHSAND->getConstantOperandVal(1) == 0x1f) { - // Get the non-const AND operands and produce scalar AND - const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - const SDValue One = DAG.getConstant(1, SL, MVT::i32); - SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, - LHSAND, Zero); - SDValue Hi = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One); - SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32); - SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask); - SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); - if (AndIndex == 0 || AndIndex == 1) - return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc, - AndIndex == 0 ? 
LoAnd : HiAnd, RHS->getFlags()); - } - } - } - } - } - return SDValue(); -} SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -4370,7 +4324,8 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, SDLoc SL(N); unsigned RHSVal; - + if (SDValue SS = getShiftForReduction(ISD::SRL, LHS, RHS, DAG)) + return SS; if (CRHS) { RHSVal = CRHS->getZExtValue(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a61a7aad82e29..4074fe288db68 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13377,7 +13377,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, // v2i32 (xor (vselect cc, x, y), K) -> // (v2i32 svelect cc, (xor x, K), (xor y, K)) This enables the xor to be // replaced with source modifiers when the select is lowered to CNDMASK. - // TODO REMOVE: prevents regressions in fneg-modifier-casting.ll unsigned Opc = LHS.getOpcode(); if(((Opc == ISD::VSELECT && VT==MVT::v2i32) || (Opc == ISD::SELECT && VT==MVT::i64)) && CRHS && CRHS->getAPIntValue().isSignMask()) { SDValue CC = LHS->getOperand(0); @@ -14355,165 +14354,126 @@ bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const { EltSize, NumElem, Idx->isDivergent(), getSubtarget()); } -// SDValue -// SITargetLowering::performBuildVectorCombine(SDNode *N, -// DAGCombinerInfo &DCI) const { -// // if (N->use_empty()) -// // return SDValue(); - -// // if(!N->getValueType(0).isFloatingPoint()) -// // return SDValue(); - -// // SelectionDAG &DAG = DCI.DAG; - -// // // Iterate the operands. Check if source modifier. If so, propogate the -// // source -// // // modifier to the user and the srcmod from the BUILD_VECTOR element. -// // for (unsigned I = 0; I < N->getNumOperands(); I++) { -// // SDValue E = N->getOperand(I); -// // if (E->getOpcode() != ISD::FNEG && E->getOpcode() != ISD::ABS) -// // continue; - -// // // Users through which we can propogate will include users of -// // // extract_element on this vector, so need to peek-through. -// // } - -// // SmallVector UsersToModify; - -// // // If the use of the BUILD_VECTOR supports source mods it can be -// // propogated. 
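For illustration, the shift-amount reduction wired into the three combines above can be seen on a reduced IR case in the style of the existing ashr64_reduce.ll tests; the function below is a sketch for review only, and its name and the exact final ISA are not taken from this series:

; Every shift amount is at least 32, so each 64-bit shift can be reduced to a
; single 32-bit shift of one half; the (amt & 31) introduced by the reduction
; should then be elided because the 32-bit VALU shift only reads the low five
; bits of its shift operand.
define <2 x i64> @shl_v2i64_amt_ge_32(<2 x i64> %arg0, <2 x i64> %shift_amt) {
  %or = or <2 x i64> %shift_amt, splat (i64 32)
  %shl = shl <2 x i64> %arg0, %or
  ret <2 x i64> %shl
}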
for (SDNode *U : N->users()) { -// // if(!U->getOpcode() == ISD::EXTRACT_VECTOR_ELT) -// // if (!allUsesHaveSourceMods(U)) -// // continue; -// // UsersToModify.push_back(U); -// // } - -// // for(auto Node: UsersToModify) { - -// // } - -// return SDValue(); -// } - - SDValue SITargetLowering::performExtractVectorEltCombine( - SDNode * N, DAGCombinerInfo & DCI) const { - SDValue Vec = N->getOperand(0); - SelectionDAG &DAG = DCI.DAG; - - EVT VecVT = Vec.getValueType(); - EVT VecEltVT = VecVT.getVectorElementType(); - EVT ResVT = N->getValueType(0); +SDValue +SITargetLowering::performExtractVectorEltCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Vec = N->getOperand(0); + SelectionDAG &DAG = DCI.DAG; - unsigned VecSize = VecVT.getSizeInBits(); - unsigned VecEltSize = VecEltVT.getSizeInBits(); + EVT VecVT = Vec.getValueType(); + EVT VecEltVT = VecVT.getVectorElementType(); + EVT ResVT = N->getValueType(0); - if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) && - allUsesHaveSourceMods(N)) { - SDLoc SL(N); - SDValue Idx = N->getOperand(1); - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, - Vec.getOperand(0), Idx); - return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt); - } - - // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) - // => - // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) - // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) - // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt - if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) { - SDLoc SL(N); - SDValue Idx = N->getOperand(1); - unsigned Opc = Vec.getOpcode(); + unsigned VecSize = VecVT.getSizeInBits(); + unsigned VecEltSize = VecEltVT.getSizeInBits(); - switch (Opc) { - default: - break; - // TODO: Support other binary operations. - case ISD::FADD: - case ISD::FSUB: - case ISD::FMUL: - case ISD::ADD: - case ISD::UMIN: - case ISD::UMAX: - case ISD::SMIN: - case ISD::SMAX: - case ISD::FMAXNUM: - case ISD::FMINNUM: - case ISD::FMAXNUM_IEEE: - case ISD::FMINNUM_IEEE: - case ISD::FMAXIMUM: - case ISD::FMINIMUM: { - SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, - Vec.getOperand(0), Idx); - SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, - Vec.getOperand(1), Idx); - - DCI.AddToWorklist(Elt0.getNode()); - DCI.AddToWorklist(Elt1.getNode()); - return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags()); - } - } - } + if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) && + allUsesHaveSourceMods(N)) { + SDLoc SL(N); + SDValue Idx = N->getOperand(1); + SDValue Elt = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx); + return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt); + } + + // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) + // => + // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) + // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) + // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt + if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) { + SDLoc SL(N); + SDValue Idx = N->getOperand(1); + unsigned Opc = Vec.getOpcode(); - // EXTRACT_VECTOR_ELT (, var-idx) => n x select (e, const-idx) - if (shouldExpandVectorDynExt(N)) { - SDLoc SL(N); - SDValue Idx = N->getOperand(1); - SDValue V; - for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { - SDValue IC = DAG.getVectorIdxConstant(I, SL); - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC); - if (I == 0) - V = Elt; - else - V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ); - } - return V; + switch (Opc) { + 
default: + break; + // TODO: Support other binary operations. + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::ADD: + case ISD::UMIN: + case ISD::UMAX: + case ISD::SMIN: + case ISD::SMAX: + case ISD::FMAXNUM: + case ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: + case ISD::FMAXIMUM: + case ISD::FMINIMUM: { + SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, + Vec.getOperand(0), Idx); + SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, + Vec.getOperand(1), Idx); + + DCI.AddToWorklist(Elt0.getNode()); + DCI.AddToWorklist(Elt1.getNode()); + return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags()); + } + } + } + + // EXTRACT_VECTOR_ELT (, var-idx) => n x select (e, const-idx) + if (shouldExpandVectorDynExt(N)) { + SDLoc SL(N); + SDValue Idx = N->getOperand(1); + SDValue V; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { + SDValue IC = DAG.getVectorIdxConstant(I, SL); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC); + if (I == 0) + V = Elt; + else + V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ); } + return V; + } - if (!DCI.isBeforeLegalize()) - return SDValue(); - - // Try to turn sub-dword accesses of vectors into accesses of the same - // 32-bit elements. This exposes more load reduction opportunities by - // replacing multiple small extract_vector_elements with a single 32-bit - // extract. - auto *Idx = dyn_cast(N->getOperand(1)); - if (isa(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() && - VecSize > 32 && VecSize % 32 == 0 && Idx) { - EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); - - unsigned BitIndex = Idx->getZExtValue() * VecEltSize; - unsigned EltIdx = BitIndex / 32; - unsigned LeftoverBitIdx = BitIndex % 32; - SDLoc SL(N); + if (!DCI.isBeforeLegalize()) + return SDValue(); - SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec); - DCI.AddToWorklist(Cast.getNode()); + // Try to turn sub-dword accesses of vectors into accesses of the same + // 32-bit elements. This exposes more load reduction opportunities by + // replacing multiple small extract_vector_elements with a single 32-bit + // extract. 
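To picture the sub-dword widening described in the comment above, a reduced IR case is sketched below; the function and the lowering notes are illustrative assumptions, not test content from this patch:

; Extracting byte 5 of a loaded <8 x i8> can be serviced by one 32-bit element:
; bitcast the vector to <2 x i32>, extract element 1 (bit index 40 / 32), shift
; right by the leftover 8 bits, then truncate the result to i8.
define i8 @extract_byte5(ptr addrspace(1) %p) {
  %vec = load <8 x i8>, ptr addrspace(1) %p, align 8
  %elt = extractelement <8 x i8> %vec, i32 5
  ret i8 %elt
}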
+ auto *Idx = dyn_cast(N->getOperand(1)); + if (isa(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() && + VecSize > 32 && VecSize % 32 == 0 && Idx) { + EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); + + unsigned BitIndex = Idx->getZExtValue() * VecEltSize; + unsigned EltIdx = BitIndex / 32; + unsigned LeftoverBitIdx = BitIndex % 32; + SDLoc SL(N); - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, - DAG.getConstant(EltIdx, SL, MVT::i32)); - DCI.AddToWorklist(Elt.getNode()); - SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, - DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); - DCI.AddToWorklist(Srl.getNode()); + SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec); + DCI.AddToWorklist(Cast.getNode()); - EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger(); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl); - DCI.AddToWorklist(Trunc.getNode()); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, + DAG.getConstant(EltIdx, SL, MVT::i32)); + DCI.AddToWorklist(Elt.getNode()); + SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, + DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); + DCI.AddToWorklist(Srl.getNode()); - if (VecEltVT == ResVT) { - return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc); - } + EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger(); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl); + DCI.AddToWorklist(Trunc.getNode()); - assert(ResVT.isScalarInteger()); - return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT); + if (VecEltVT == ResVT) { + return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc); } - return SDValue(); + assert(ResVT.isScalarInteger()); + return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT); } + return SDValue(); +} + SDValue SITargetLowering::performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const { diff --git a/llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll b/llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll index a06b724e1529c..78942bfc68d63 100644 --- a/llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/ashr64_reduce.ll @@ -112,10 +112,8 @@ define <2 x i64> @ashr_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) { ; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v2, 31, v8 -; CHECK-NEXT: v_and_b32_e32 v0, 31, v6 -; CHECK-NEXT: v_ashrrev_i32_e32 v0, v0, v1 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, v6, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, v8, v3 ; CHECK-NEXT: v_mov_b32_e32 v1, v5 ; CHECK-NEXT: v_mov_b32_e32 v3, v4 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -147,10 +145,8 @@ define <2 x i64> @ashr_exact_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) { ; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v2, 31, v8 -; CHECK-NEXT: v_and_b32_e32 v0, 31, v6 -; CHECK-NEXT: v_ashrrev_i32_e32 v0, v0, v1 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, v6, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, v8, v3 ; CHECK-NEXT: v_mov_b32_e32 v1, v5 ; CHECK-NEXT: v_mov_b32_e32 v3, v4 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -394,11 +390,9 @@ define <2 x i64> @ashr_v2_or32(<2 x i64> %arg0, <2 x i64> %shift_amt) { ; CHECK-LABEL: ashr_v2_or32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v2, 31, v6 -; CHECK-NEXT: v_and_b32_e32 v0, 31, v4 -; CHECK-NEXT: 
v_ashrrev_i32_e32 v0, v0, v1 -; CHECK-NEXT: v_ashrrev_i32_e32 v2, v2, v3 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v1 ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, v6, v3 ; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v3 ; CHECK-NEXT: s_setpc_b64 s[30:31] %or = or <2 x i64> %shift_amt, splat (i64 32) @@ -471,17 +465,13 @@ define <2 x i64> @ashr_v2_or32_sgpr(<2 x i64> inreg %arg0, <2 x i64> inreg %shif ; CHECK-LABEL: ashr_v2_or32_sgpr: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 31 -; CHECK-NEXT: s_mov_b32 s21, s22 -; CHECK-NEXT: s_mov_b32 s5, s4 -; CHECK-NEXT: s_and_b64 s[4:5], s[20:21], s[4:5] -; CHECK-NEXT: s_ashr_i32 s6, s17, 31 +; CHECK-NEXT: s_ashr_i32 s4, s17, s20 +; CHECK-NEXT: s_ashr_i32 s5, s17, 31 +; CHECK-NEXT: s_ashr_i32 s6, s19, s22 ; CHECK-NEXT: s_ashr_i32 s7, s19, 31 -; CHECK-NEXT: s_ashr_i32 s4, s17, s4 -; CHECK-NEXT: s_ashr_i32 s5, s19, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s6 -; CHECK-NEXT: v_mov_b32_e32 v2, s5 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: s_setpc_b64 s[30:31] %or = or <2 x i64> %shift_amt, splat (i64 32) From 7a6fe7951a28f97224b1e767e137449aff1dfe1b Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 21 Jul 2025 11:09:29 -0500 Subject: [PATCH 22/25] Tidy up getShiftForReduction() --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 69 ++++++++++--------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index bc609032c5e1b..0645abffa1545 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4069,40 +4069,41 @@ static SDValue getShiftForReduction(unsigned ShiftOpc, SDValue LHS, SDValue RHS, "Expected shift Opcode."); SDLoc SL = SDLoc(RHS); - if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - SDValue VAND = RHS.getOperand(0); - if (ConstantSDNode *CRRHS = dyn_cast(RHS->getOperand(1))) { - uint64_t AndIndex = RHS->getConstantOperandVal(1); - if (VAND->getOpcode() == ISD::AND && CRRHS) { - SDValue LHSAND = VAND.getOperand(0); - SDValue RHSAND = VAND.getOperand(1); - if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) { - ConstantSDNode *CANDL = - dyn_cast(RHSAND->getOperand(0)); - ConstantSDNode *CANDR = - dyn_cast(RHSAND->getOperand(1)); - if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f && - RHSAND->getConstantOperandVal(1) == 0x1f) { - // Get the non-const AND operands and produce scalar AND - const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - const SDValue One = DAG.getConstant(1, SL, MVT::i32); - SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, - LHSAND, Zero); - SDValue Hi = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One); - SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32); - SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask); - SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); - if (AndIndex == 0 || AndIndex == 1) - return DAG.getNode(ShiftOpc, SL, MVT::i32, Trunc, - AndIndex == 0 ? 
LoAnd : HiAnd, - RHS->getFlags()); - } - } - } - } - } + if (RHS->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SDValue VAND = RHS.getOperand(0); + if (VAND->getOpcode() != ISD::AND) + return SDValue(); + + ConstantSDNode *CRRHS = dyn_cast(RHS->getOperand(1)); + if (!CRRHS) + return SDValue(); + + SDValue LHSAND = VAND.getOperand(0); + SDValue RHSAND = VAND.getOperand(1); + if (RHSAND->getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + ConstantSDNode *CANDL = dyn_cast(RHSAND->getOperand(0)); + ConstantSDNode *CANDR = dyn_cast(RHSAND->getOperand(1)); + if (!CANDL || !CANDR || RHSAND->getConstantOperandVal(0) != 0x1f || + RHSAND->getConstantOperandVal(1) != 0x1f) + return SDValue(); + // Get the non-const AND operands and produce scalar AND + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, Zero); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One); + SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32); + SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask); + SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); + uint64_t AndIndex = RHS->getConstantOperandVal(1); + if (AndIndex == 0 || AndIndex == 1) + return DAG.getNode(ShiftOpc, SL, MVT::i32, Trunc, + AndIndex == 0 ? LoAnd : HiAnd, RHS->getFlags()); + return SDValue(); } From 09d745adf2b570dc4aade709f7f8e2c169888c26 Mon Sep 17 00:00:00 2001 From: Chris Jackson Date: Mon, 21 Jul 2025 11:18:50 -0500 Subject: [PATCH 23/25] Remove driveby formatting fixes --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 5 ++--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 +++------ llvm/lib/Target/AMDGPU/SIInstructions.td | 1 + 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 0645abffa1545..0391cbf89e30d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4314,7 +4314,6 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, SL, VT, Vec); } - SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue RHS = N->getOperand(1); @@ -4841,8 +4840,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) return SDValue(); - return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS, - RHS); + return distributeOpThroughSelect(DCI, LHS.getOpcode(), + SDLoc(N), Cond, LHS, RHS); } bool Inv = false; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4074fe288db68..6ab96b772cd82 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13388,8 +13388,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, return XSelect; } - - // Make sure to apply the 64-bit constant splitting fold before trying to fold // fneg-like xors into 64-bit select. if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) { @@ -14435,10 +14433,9 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N, if (!DCI.isBeforeLegalize()) return SDValue(); - // Try to turn sub-dword accesses of vectors into accesses of the same - // 32-bit elements. 
This exposes more load reduction opportunities by
-  // replacing multiple small extract_vector_elements with a single 32-bit
-  // extract.
+  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
+  // elements. This exposes more load reduction opportunities by replacing
+  // multiple small extract_vector_elements with a single 32-bit extract.
   auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (isa(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
       VecSize > 32 && VecSize % 32 == 0 && Idx) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3e3603cc9ef64..196675b7362fb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1903,6 +1903,7 @@ def : GCNPat <
 >;
 }
 
+
 /********** ================================ **********/
 /********** Floating point absolute/negative **********/
 /********** ================================ **********/

From d789ecef007ba6fd7939277f7be69e569108f999 Mon Sep 17 00:00:00 2001
From: Chris Jackson
Date: Mon, 21 Jul 2025 11:28:41 -0500
Subject: [PATCH 24/25] Fix formatting of xorcombine - how did this regress?

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6ab96b772cd82..5c29d6531e8dd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13378,16 +13378,19 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   // (v2i32 svelect cc, (xor x, K), (xor y, K)) This enables the xor to be
   // replaced with source modifiers when the select is lowered to CNDMASK.
   unsigned Opc = LHS.getOpcode();
-  if(((Opc == ISD::VSELECT && VT==MVT::v2i32) || (Opc == ISD::SELECT && VT==MVT::i64)) && CRHS && CRHS->getAPIntValue().isSignMask()) {
+  if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
+       (Opc == ISD::SELECT && VT == MVT::i64)) &&
+      CRHS && CRHS->getAPIntValue().isSignMask()) {
     SDValue CC = LHS->getOperand(0);
     SDValue TRUE = LHS->getOperand(1);
     SDValue FALSE = LHS->getOperand(2);
     SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
     SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
-    SDValue XSelect = DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
+    SDValue XSelect =
+        DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
     return XSelect;
   }
-  
+
   // Make sure to apply the 64-bit constant splitting fold before trying to fold
   // fneg-like xors into 64-bit select.
   if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {

From cc8652e292632311b4553b82c647f037c0602ffb Mon Sep 17 00:00:00 2001
From: Chris Jackson
Date: Tue, 22 Jul 2025 06:09:33 -0500
Subject: [PATCH 25/25] Simplify SelectVOP3ModsImpl

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 24 ++++++-------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 97fa9fd7742c4..e0488f0f16e9b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3061,44 +3061,34 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
 
   // v2i32 xor/or/and are legal. A vselect using these instructions as operands
   // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
-  // through this extract if possible.
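For review, the peek-through case that this simplification keeps working can be written as a reduced IR function; the name and the expected-codegen remark below are illustrative assumptions, not CHECK lines from this series:

; After legalisation the v2i32 xor survives as a vector op and each scalarised
; select sees it through an extractelement, so the sign-mask xor should still
; fold into a neg source modifier on v_cndmask instead of a separate v_xor.
define <2 x i32> @fneg_select_v2i32(<2 x i1> %cond, <2 x i32> %a, <2 x i32> %b) {
  %neg = xor <2 x i32> %a, splat (i32 -2147483648)
  %sel = select <2 x i1> %cond, <2 x i32> %neg, <2 x i32> %b
  ret <2 x i32> %sel
}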
-  auto getVectorBitWiseOp = [](SDValue S) -> SDValue {
-    if (S->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
-      SDValue VecOp = S->getOperand(0);
-      if (VecOp.getOpcode() == ISD::XOR || VecOp.getOpcode() == ISD::AND ||
-          VecOp.getOpcode() == ISD::OR)
-        return VecOp;
-    }
-    return SDValue();
-  };
+  // through the extract to the bitwise op.
 
-  SDValue Vec = getVectorBitWiseOp(Src);
-  SDValue BWSrc = Vec ? Vec : Src;
+  SDValue PeekSrc = Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
 
   // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
   // types as the codegen replaces the operand without adding a srcmod.
   // This is intentionally finding the cases where we are performing float neg
   // and abs on int types, the goal is not to obtain two's complement neg or
   // abs.
   // TODO: Add 16-bit support.
-  unsigned Opc = Vec ? Vec->getOpcode() : Src->getOpcode();
+  unsigned Opc = PeekSrc.getOpcode();
   EVT VT = Src.getValueType();
   if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
       (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
     return true;
 
   ConstantSDNode *CRHS =
-      isConstOrConstSplat(Vec ? Vec->getOperand(1) : Src->getOperand(1));
+      isConstOrConstSplat(PeekSrc->getOperand(1));
   if (!CRHS)
     return true;
 
   auto ReplaceSrc = [&]() -> SDValue {
-    if (Vec) {
-      SDValue LHS = BWSrc->getOperand(0);
+    if (Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+      SDValue LHS = PeekSrc->getOperand(0);
       SDValue Index = Src->getOperand(1);
       return Src = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
                                    Src.getValueType(), LHS, Index);
     }
-    return Src = BWSrc.getOperand(0);
+    return Src = PeekSrc.getOperand(0);
   };
 
   // Recognise (xor a, 0x80000000) as NEG SrcMod.