73 changes: 69 additions & 4 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1411,15 +1411,80 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
return false;

// Try to optimize the pattern Y == Const ? Const : Z. If Const cannot be
// directly encoded in the cndmask, try to reuse a register that already
// holds the Const value from the comparison instruction.
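//
// For example (a sketch; cf. the i32 tests added in this patch), given:
//   s_mov_b32     s4, 0x67932
//   v_cmp_ne_u32  vcc, s4, v0
//   v_mov_b32     v2, 0x67932        ; constant materialized a second time
//   v_cndmask_b32 v0, v2, v1, vcc
// the cndmask can reuse v0 instead of v2, because v0 == 0x67932 exactly
// when the compare is false, and src0 is the operand selected then:
//   v_cndmask_b32 v0, v0, v1, vcc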
auto tryFoldCndMaskCmp =
[&](MachineOperand *SrcOp, std::optional<int64_t> SrcImm,
unsigned CmpOpcodes[4]) -> bool {
// Only handle register operands whose (materialized) value is a known
// immediate.
if (!SrcImm || !SrcOp->isReg())
return false;

// Find the predicate of the cndmask instruction.
MachineOperand *PredOp = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
if (!PredOp || !PredOp->isReg())
return false;

MachineInstr *PredI = MRI->getVRegDef(PredOp->getReg());
if (!PredI || !PredI->isCompare())
return false;

unsigned CmpOpc = PredI->getOpcode();

if (CmpOpc != CmpOpcodes[0] && CmpOpc != CmpOpcodes[1] &&
CmpOpc != CmpOpcodes[2] && CmpOpc != CmpOpcodes[3])
return false;

// Check if the source operand's immediate matches the immediate of either
// the first or the second operand of the comparison instruction.
MachineOperand *SubstOp = nullptr;
std::optional<int64_t> CmpValImm = getImmOrMaterializedImm(
*TII->getNamedOperand(*PredI, AMDGPU::OpName::src0));
if (CmpValImm && *CmpValImm == *SrcImm) {
SubstOp = TII->getNamedOperand(*PredI, AMDGPU::OpName::src1);
} else {
CmpValImm = getImmOrMaterializedImm(
*TII->getNamedOperand(*PredI, AMDGPU::OpName::src1));
if (CmpValImm && *CmpValImm == *SrcImm) {
SubstOp = TII->getNamedOperand(*PredI, AMDGPU::OpName::src0);
} else {
return false;
}
}

if (!SubstOp || !SubstOp->isReg())
return false;

LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
SrcOp->setReg(SubstOp->getReg());
LLVM_DEBUG(dbgs() << MI);
return true;
};

MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (!Src1->isIdenticalTo(*Src0)) {
std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
if (!Src1Imm)
return false;
// Try to fold with not-equal comparisons: for (x != C) ? Z : C, src0 holds
// C and is selected exactly when the compare is false, i.e. when x == C.
unsigned NECmpOpcodes[4] = {
AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_LG_F32_e64,
AMDGPU::V_CMP_NE_I32_e64, AMDGPU::V_CMP_NE_U32_e64};

std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
if (!Src0Imm || *Src0Imm != *Src1Imm)
if (tryFoldCndMaskCmp(Src0, Src0Imm, NECmpOpcodes))
return true;

// Try to fold with equal comparisons: for (x == C) ? C : Z, src1 holds C
// and is selected exactly when the compare is true, i.e. when x == C.
unsigned EQCmpOpcodes[4] = {
AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64,
AMDGPU::V_CMP_EQ_I32_e64, AMDGPU::V_CMP_EQ_U32_e64};

if (tryFoldCndMaskCmp(Src1, Src1Imm, EQCmpOpcodes))
return true;

if (!Src0Imm || !Src1Imm || *Src0Imm != *Src1Imm)
return false;
}

171 changes: 171 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
@@ -0,0 +1,171 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10

Contributor:
What about with an inline immediate? It shouldn't be any better?

Collaborator (Author):
In those cases the immediate has already been inlined into the v_cndmask where possible. E.g. in the tests it is successfully inlined for gfx1030 before the patch processes the v_cndmask; in such cases the fold simply skips the instruction.
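
For instance (a hypothetical sketch, not one of this patch's tests): with an inlinable immediate such as 1.0, both instructions already encode the constant directly, so there is nothing for this fold to improve:

v_cmp_neq_f32_e32 vcc, 1.0, v0
v_cndmask_b32_e32 v0, 1.0, v1, vcc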

define float @f32_oeq_v_i(float %arg, float %arg1) {
; GFX9-LABEL: f32_oeq_v_i:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: f32_oeq_v_i:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
%fcmp = fcmp oeq float %arg, 0x3FCF5C2900000000
%select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1
ret float %select
}

define float @f32_oeq_i_v(float %arg, float %arg1) {
; GFX9-LABEL: f32_oeq_i_v:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: f32_oeq_i_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
%fcmp = fcmp oeq float 0x3FCF5C2900000000, %arg
%select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1
ret float %select
}

define float @f32_one_v_i(float %arg, float %arg1) {
; GFX9-LABEL: f32_one_v_i:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: f32_one_v_i:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
%fcmp = fcmp one float %arg, 0x3FCF5C2900000000
%select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
ret float %select
}

define float @f32_one_i_v(float %arg, float %arg1) {
; GFX9-LABEL: f32_one_i_v:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: f32_one_i_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
%fcmp = fcmp one float %arg, 0x3FCF5C2900000000
%select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
ret float %select
}

define i32 @i32_eq_v_i(i32 %arg, i32 %arg1) {
; GFX9-LABEL: i32_eq_v_i:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x67932
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: i32_eq_v_i:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
%icmp = icmp eq i32 %arg, 424242
%select = select i1 %icmp, i32 424242, i32 %arg1
ret i32 %select
}

define i32 @i32_eq_i_v(i32 %arg, i32 %arg1) {
; GFX9-LABEL: i32_eq_i_v:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x67932
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: i32_eq_i_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
%icmp = icmp eq i32 424242, %arg
%select = select i1 %icmp, i32 424242, i32 %arg1
ret i32 %select
}

define i32 @i32_ne_v_i(i32 %arg, i32 %arg1) {
; GFX9-LABEL: i32_ne_v_i:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x67932
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: i32_ne_v_i:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
%icmp = icmp ne i32 %arg, 424242
%select = select i1 %icmp, i32 %arg1, i32 424242
ret i32 %select
}

define i32 @i32_ne_i_v(i32 %arg, i32 %arg1) {
; GFX9-LABEL: i32_ne_i_v:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x67932
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: i32_ne_i_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
%icmp = icmp ne i32 424242, %arg
%select = select i1 %icmp, i32 %arg1, i32 424242
ret i32 %select
}
Contributor:

Test half, i16, and bfloat cases. Plus 64-bit.

Collaborator (Author):
> Test half, i16, and bfloat cases. Plus 64-bit.

I added half and i16 types. For the bfloat case we get no advantage here, since the immediate would be stored in two registers in any case: one (shifted left) for the compare and a second (the original) for the cndmask, like:

v_lshlrev_b32_e32 v2, 16, v0
s_mov_b32 s4, 0x42420000
v_cmp_eq_f32_e32 vcc, s4, v2
v_cndmask_b32_e32 v0, v1, v0, vcc

For 64-bit types this folding doesn't work yet, since they are lowered to a different cmp/cndmask pattern that operates on pairs of registers (built with REG_SEQUENCEs). I would implement it incrementally in a follow-up patch.
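
For reference, a rough sketch of the 64-bit shape (illustrative only, assuming the usual split of the select into 32-bit halves; registers and code are not from this patch's tests): the compare reads the value as a register pair while the select is done per half, so the fold would have to reason about both halves of the pair at once:

v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]   ; s[4:5] holds the 64-bit constant
v_cndmask_b32_e32 v0, v4, v2, vcc      ; low half selected separately
v_cndmask_b32_e32 v1, v5, v3, vcc      ; high half (v[4:5] = constant again)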
