diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 91df516b80857..01887d5021c9b 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1411,15 +1411,126 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
       Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
     return false;
 
+  // Bit width of the values compared by the floating-point compares handled
+  // below; 0 for every other opcode (including the integer compares).
+  auto getFPCmpBits = [](unsigned CmpOpc) -> unsigned {
+    switch (CmpOpc) {
+    case AMDGPU::V_CMP_EQ_F16_e64:
+    case AMDGPU::V_CMP_NEQ_F16_e64:
+      return 16;
+    case AMDGPU::V_CMP_EQ_F32_e64:
+    case AMDGPU::V_CMP_NEQ_F32_e64:
+      return 32;
+    default:
+      return 0;
+    }
+  };
+
+  // Try to optimize Y == Const ? Const : Z. If Const can't be directly
+  // encoded in the cndmask, reuse the register holding Y from the comparison
+  // instruction: whenever Const is the selected value, Y is equal to it.
+  //
+  // SrcOp is the cndmask source selected when the compare operands are equal,
+  // SrcImm its constant value (if known), and CmpOpcodes the compare opcodes
+  // for which that selection holds. Returns true if SrcOp was rewritten.
+  //
+  // NOTE(review): for the 16-bit compares only the low 16 bits of Y are known
+  // to match Const, yet the whole 32-bit register is substituted. Confirm no
+  // user of the cndmask result depends on the high half.
+  auto tryFoldCndMaskCmp = [&](MachineOperand *SrcOp,
+                               std::optional<int64_t> SrcImm,
+                               ArrayRef<unsigned> CmpOpcodes) -> bool {
+    // Only register operands with a known constant value are candidates.
+    if (!SrcImm || !SrcOp->isReg())
+      return false;
+
+    // Find the instruction defining the cndmask predicate. getVRegDef() is
+    // only queried for virtual registers.
+    MachineOperand *PredOp = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    if (!PredOp || !PredOp->isReg() || !PredOp->getReg().isVirtual())
+      return false;
+
+    MachineInstr *PredI = MRI->getVRegDef(PredOp->getReg());
+    if (!PredI || !PredI->isCompare() ||
+        !is_contained(CmpOpcodes, PredI->getOpcode()))
+      return false;
+
+    // With source modifiers (abs/neg) the compare does not test the raw
+    // register value, so the register cannot stand in for the constant.
+    if (TII->hasAnyModifiersSet(*PredI))
+      return false;
+
+    // Floating-point equality does not imply identical bits: +0 and -0
+    // compare equal, and so may denormals if the compare flushes them (the FP
+    // mode is not inspected here). Reject every constant whose exponent field
+    // is zero; this also covers the f16 encoding of -0.0 (0x8000).
+    if (unsigned FPBits = getFPCmpBits(PredI->getOpcode())) {
+      const uint64_t ExpMask = FPBits == 16 ? 0x7c00 : 0x7f800000;
+      if ((static_cast<uint64_t>(*SrcImm) & ExpMask) == 0)
+        return false;
+    }
+
+    // Locate the compare operand holding the constant; the other operand is
+    // the register to substitute.
+    MachineOperand *CmpSrc0 =
+        TII->getNamedOperand(*PredI, AMDGPU::OpName::src0);
+    MachineOperand *CmpSrc1 =
+        TII->getNamedOperand(*PredI, AMDGPU::OpName::src1);
+    if (!CmpSrc0 || !CmpSrc1)
+      return false;
+
+    MachineOperand *SubstOp = nullptr;
+    if (getImmOrMaterializedImm(*CmpSrc0) == SrcImm)
+      SubstOp = CmpSrc1;
+    else if (getImmOrMaterializedImm(*CmpSrc1) == SrcImm)
+      SubstOp = CmpSrc0;
+
+    if (!SubstOp || !SubstOp->isReg() || !SubstOp->getReg().isVirtual())
+      return false;
+
+    // The register must be acceptable in SrcOp's position (register class,
+    // constant bus restrictions).
+    if (!TII->isOperandLegal(MI, SrcOp->getOperandNo(), SubstOp))
+      return false;
+
+    LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
+    SrcOp->setReg(SubstOp->getReg());
+    SrcOp->setSubReg(SubstOp->getSubReg());
+    // MI is a new, later use of the register: kill flags on earlier uses are
+    // now stale.
+    MRI->clearKillFlags(SubstOp->getReg());
+    LLVM_DEBUG(dbgs() << MI);
+    return true;
+  };
+
   MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
   MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
   if (!Src1->isIdenticalTo(*Src0)) {
-    std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
-    if (!Src1Imm)
-      return false;
+    // The cndmask selects src0 when the predicate is false, i.e. when a
+    // "not equal" compare saw equal operands. The ordered V_CMP_LG_* compares
+    // are deliberately not listed: they are also false for NaN inputs, where
+    // Y must not replace Const.
+    static constexpr unsigned NECmpOpcodes[] = {
+        AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F16_e64,
+        AMDGPU::V_CMP_NE_I32_e64,  AMDGPU::V_CMP_NE_U32_e64,
+        AMDGPU::V_CMP_NE_I16_e64,  AMDGPU::V_CMP_NE_U16_e64};
 
     std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
-    if (!Src0Imm || *Src0Imm != *Src1Imm)
+    if (tryFoldCndMaskCmp(Src0, Src0Imm, NECmpOpcodes))
+      return true;
+
+    // src1 is selected when the predicate is true, i.e. when an "equal"
+    // compare saw equal operands.
+    static constexpr unsigned EQCmpOpcodes[] = {
+        AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F16_e64,
+        AMDGPU::V_CMP_EQ_I32_e64, AMDGPU::V_CMP_EQ_U32_e64,
+        AMDGPU::V_CMP_EQ_I16_e64, AMDGPU::V_CMP_EQ_U16_e64};
+
+    std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
+    if (tryFoldCndMaskCmp(Src1, Src1Imm, EQCmpOpcodes))
+      return true;
+
+    if (!Src0Imm || !Src1Imm || *Src0Imm != *Src1Imm)
       return false;
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll b/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
new file mode 100644
index 0000000000000..8ae4ff7da31f8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
@@ -0,0 +1,597 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s
-check-prefix=GFX10 + +define float @f32_oeq_v_i(float %arg, float %arg1) { +; GFX9-LABEL: f32_oeq_v_i: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148 +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f32_oeq_v_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq float %arg, 0x3FCF5C2900000000 + %select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1 + ret float %select +} + +define float @f32_oeq_i_v(float %arg, float %arg1) { +; GFX9-LABEL: f32_oeq_i_v: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148 +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f32_oeq_i_v: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq float 0x3FCF5C2900000000, %arg + %select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1 + ret float %select +} + +define float @f32_one_v_i(float %arg, float %arg1) { +; GFX9-LABEL: f32_one_v_i: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148 +; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f32_one_v_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0 +; 
GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+  %fcmp = fcmp one float %arg, 0x3FCF5C2900000000
+  %select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
+  ret float %select
+}
+
+define float @f32_one_i_v(float %arg, float %arg1) {
+; GFX9-LABEL: f32_one_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_one_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+  %fcmp = fcmp one float 0x3FCF5C2900000000, %arg
+  %select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
+  ret float %select
+}
+
+define half @f16_oeq_v_i(half %arg, half %arg1) {
+; GFX9-LABEL: f16_oeq_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x5140
+; GFX9-NEXT: v_cmp_neq_f16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f16_oeq_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0x5140, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+  %fcmp = fcmp oeq half %arg, 42.0
+  %select = select i1 %fcmp, half 42.0, half %arg1
+  ret half %select
+}
+
+define half @f16_oeq_i_v(half %arg, half %arg1) {
+; GFX9-LABEL: f16_oeq_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x5140
+; GFX9-NEXT: v_cmp_neq_f16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; 
GFX10-LABEL: f16_oeq_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0x5140, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+  %fcmp = fcmp oeq half 42.0, %arg
+  %select = select i1 %fcmp, half 42.0, half %arg1
+  ret half %select
+}
+
+define half @f16_one_v_i(half %arg, half %arg1) {
+; GFX9-LABEL: f16_one_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x5140
+; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f16_one_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x5140, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+  %fcmp = fcmp one half %arg, 42.0
+  %select = select i1 %fcmp, half %arg1, half 42.0
+  ret half %select
+}
+
+define half @f16_one_i_v(half %arg, half %arg1) {
+; GFX9-LABEL: f16_one_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x5140
+; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f16_one_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x5140, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+  %fcmp = fcmp one half 42.0, %arg
+  %select = select i1 %fcmp, half %arg1, half 42.0
+  ret half %select
+}
+
+define i32 @i32_eq_v_i(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_eq_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i32_eq_v_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %icmp = icmp eq i32 %arg, 424242 + %select = select i1 %icmp, i32 424242, i32 %arg1 + ret i32 %select +} + +define i32 @i32_eq_i_v(i32 %arg, i32 %arg1) { +; GFX9-LABEL: i32_eq_i_v: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x67932 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i32_eq_i_v: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %icmp = icmp eq i32 424242, %arg + %select = select i1 %icmp, i32 424242, i32 %arg1 + ret i32 %select +} + +define i32 @i32_ne_v_i(i32 %arg, i32 %arg1) { +; GFX9-LABEL: i32_ne_v_i: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x67932 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i32_ne_v_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %icmp = icmp ne i32 %arg, 424242 + %select = select i1 %icmp, i32 %arg1, i32 424242 + ret i32 %select +} + +define i32 @i32_ne_i_v(i32 %arg, i32 %arg1) { +; GFX9-LABEL: i32_ne_i_v: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x67932 +; 
GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i32_ne_i_v: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %icmp = icmp ne i32 424242, %arg + %select = select i1 %icmp, i32 %arg1, i32 424242 + ret i32 %select +} + +define i16 @i16_eq_v_i(i16 %arg, i16 %arg1) { +; GFX9-LABEL: i16_eq_v_i: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0x1092 +; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_eq_v_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %icmp = icmp eq i16 %arg, 4242 + %select = select i1 %icmp, i16 4242, i16 %arg1 + ret i16 %select +} + +define i16 @i16_eq_i_v(i16 %arg, i16 %arg1) { +; GFX9-LABEL: i16_eq_i_v: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0x1092 +; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_eq_i_v: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %icmp = icmp eq i16 4242, %arg + %select = select i1 %icmp, i16 4242, i16 %arg1 + ret i16 %select +} + +define i16 @i16_ne_v_i(i16 %arg, i16 %arg1) { +; GFX9-LABEL: i16_ne_v_i: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0x1092 +; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_ne_v_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %icmp = icmp ne i16 %arg, 4242 + %select = select i1 %icmp, i16 %arg1, i16 4242 + ret i16 %select +} + +define i16 @i16_ne_i_v(i16 %arg, i16 %arg1) { +; GFX9-LABEL: i16_ne_i_v: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0x1092 +; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_ne_i_v: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %icmp = icmp ne i16 4242, %arg + %select = select i1 %icmp, i16 %arg1, i16 4242 + ret i16 %select +} + +define float @f32_oeq_z_i(float %arg, float %arg1) { +; GFX9-LABEL: f32_oeq_z_i: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f32_oeq_z_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq float %arg, 0.000000e+00 + %select = select i1 %fcmp, float 0.000000e+00, float %arg1 + ret float %select +} + +define float @f32_oeq_z_z(float %arg, float %arg1) { +; GFX9-LABEL: f32_oeq_z_z: +; GFX9: ; %bb.0: ; %bb +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f32_oeq_z_z: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq float %arg, 0.000000e+00 + %select = select i1 %fcmp, float %arg, float %arg1 + ret float %select +} + +define half @f16_oeq_z_i(half %arg, half %arg1) { +; GFX9-LABEL: f16_oeq_z_i: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_neq_f16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f16_oeq_z_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq half %arg, 0.000000e+00 + %select = select i1 %fcmp, half 0.000000e+00, half %arg1 + ret half %select +} + +define half @f16_oeq_z_z(half %arg, half %arg1) { +; GFX9-LABEL: f16_oeq_z_z: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_f16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f16_oeq_z_z: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_f16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq half %arg, 0.000000e+00 + %select = select i1 %fcmp, half %arg, half %arg1 + ret half %select +} + +define float @f32_oeq_negz_i(float %arg, float %arg1) { +; GFX9-LABEL: f32_oeq_negz_i: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_brev_b32 s4, 1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f32_oeq_negz_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x80000000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x80000000, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq float %arg, -0.000000e+00 + %select = select i1 %fcmp, float -0.000000e+00, float %arg1 + ret float %select +} + +define float @f32_oeq_negz_z(float %arg, float %arg1) { +; GFX9-LABEL: f32_oeq_negz_z: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_brev_b32 s4, 1 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f32_oeq_negz_z: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x80000000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq float %arg, -0.000000e+00 + %select = select i1 %fcmp, float %arg, float %arg1 + ret float %select +} + +define half @f16_oeq_negz_i(half %arg, half %arg1) { +; GFX9-LABEL: f16_oeq_negz_i: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x8000 +; GFX9-NEXT: v_cmp_neq_f16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f16_oeq_negz_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0x8000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x8000, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq half %arg, -0.000000e+00 + %select = select i1 %fcmp, 
half -0.000000e+00, half %arg1 + ret half %select +} + +define half @f16_oeq_negz_z(half %arg, half %arg1) { +; GFX9-LABEL: f16_oeq_negz_z: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x8000 +; GFX9-NEXT: v_cmp_eq_f16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f16_oeq_negz_z: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_f16_e32 vcc_lo, 0x8000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq half %arg, -0.000000e+00 + %select = select i1 %fcmp, half %arg, half %arg1 + ret half %select +} + +define double @f64_oeq_z_i(double %arg, double %arg1) { +; GFX9-LABEL: f64_oeq_z_i: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_neq_f64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f64_oeq_z_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq double %arg, 0.000000e+00 + %select = select i1 %fcmp, double 0.000000e+00, double %arg1 + ret double %select +} + +define double @f64_oeq_z_z(double %arg, double %arg1) { +; GFX9-LABEL: f64_oeq_z_z: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f64_oeq_z_z: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX10-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq double %arg, 0.000000e+00 + %select = select i1 %fcmp, double %arg, double %arg1 + ret double %select +} + +define double @f64_oeq_negz_i(double %arg, double %arg1) { +; GFX9-LABEL: f64_oeq_negz_i: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_brev_b32 s5, 1 +; GFX9-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f64_oeq_negz_i: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x80000000, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x80000000, v3, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq double %arg, -0.000000e+00 + %select = select i1 %fcmp, double -0.000000e+00, double %arg1 + ret double %select +} + +define double @f64_oeq_negz_z(double %arg, double %arg1) { +; GFX9-LABEL: f64_oeq_negz_z: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_brev_b32 s5, 1 +; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: f64_oeq_negz_z: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0x80000000, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %fcmp = fcmp oeq double %arg, 
-0.000000e+00 + %select = select i1 %fcmp, double %arg, double %arg1 + ret double %select +}