219 changes: 219 additions & 0 deletions llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -51,6 +51,10 @@ class SIShrinkInstructions {
unsigned SubReg) const;
bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
unsigned SubReg) const;
Register trySwapCndOperands(MachineInstr &MI) const;
bool shouldSwapCndOperands(Register Reg,
std::vector<MachineInstr *> &UsesToProcess) const;
unsigned getInverseCompareOpcode(MachineInstr &MI) const;
TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
unsigned I) const;
void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
@@ -830,6 +834,215 @@ bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
return true;
}

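// Return the opcode of the compare whose result is the logical negation of
// MI's comparison, or 0 if MI is not a handled compare.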
unsigned SIShrinkInstructions::getInverseCompareOpcode(MachineInstr &MI) const {
switch (MI.getOpcode()) {
// int 32
case AMDGPU::V_CMP_EQ_I32_e64:
return AMDGPU::V_CMP_NE_I32_e64;
case AMDGPU::V_CMP_NE_I32_e64:
return AMDGPU::V_CMP_EQ_I32_e64;
case AMDGPU::V_CMP_GE_I32_e64:
return AMDGPU::V_CMP_LT_I32_e64;
case AMDGPU::V_CMP_LE_I32_e64:
return AMDGPU::V_CMP_GT_I32_e64;
case AMDGPU::V_CMP_GT_I32_e64:
return AMDGPU::V_CMP_LE_I32_e64;
case AMDGPU::V_CMP_LT_I32_e64:
return AMDGPU::V_CMP_GE_I32_e64;
// int 64
case AMDGPU::V_CMP_EQ_I64_e64:
return AMDGPU::V_CMP_NE_I64_e64;
case AMDGPU::V_CMP_NE_I64_e64:
return AMDGPU::V_CMP_EQ_I64_e64;
case AMDGPU::V_CMP_GE_I64_e64:
return AMDGPU::V_CMP_LT_I64_e64;
case AMDGPU::V_CMP_LE_I64_e64:
return AMDGPU::V_CMP_GT_I64_e64;
case AMDGPU::V_CMP_GT_I64_e64:
return AMDGPU::V_CMP_LE_I64_e64;
case AMDGPU::V_CMP_LT_I64_e64:
return AMDGPU::V_CMP_GE_I64_e64;
// unsigned 32
case AMDGPU::V_CMP_EQ_U32_e64:
return AMDGPU::V_CMP_NE_U32_e64;
case AMDGPU::V_CMP_NE_U32_e64:
return AMDGPU::V_CMP_EQ_U32_e64;
case AMDGPU::V_CMP_GE_U32_e64:
return AMDGPU::V_CMP_LT_U32_e64;
case AMDGPU::V_CMP_LE_U32_e64:
return AMDGPU::V_CMP_GT_U32_e64;
case AMDGPU::V_CMP_GT_U32_e64:
return AMDGPU::V_CMP_LE_U32_e64;
case AMDGPU::V_CMP_LT_U32_e64:
return AMDGPU::V_CMP_GE_U32_e64;
// unsigned 64
case AMDGPU::V_CMP_EQ_U64_e64:
return AMDGPU::V_CMP_NE_U64_e64;
case AMDGPU::V_CMP_NE_U64_e64:
return AMDGPU::V_CMP_EQ_U64_e64;
case AMDGPU::V_CMP_GE_U64_e64:
return AMDGPU::V_CMP_LT_U64_e64;
case AMDGPU::V_CMP_LE_U64_e64:
return AMDGPU::V_CMP_GT_U64_e64;
case AMDGPU::V_CMP_GT_U64_e64:
return AMDGPU::V_CMP_LE_U64_e64;
case AMDGPU::V_CMP_LT_U64_e64:
return AMDGPU::V_CMP_GE_U64_e64;
// float 32
case AMDGPU::V_CMP_EQ_F32_e64:
return AMDGPU::V_CMP_NEQ_F32_e64;
case AMDGPU::V_CMP_NEQ_F32_e64:
return AMDGPU::V_CMP_EQ_F32_e64;
case AMDGPU::V_CMP_GE_F32_e64:
return AMDGPU::V_CMP_NGE_F32_e64;
case AMDGPU::V_CMP_NGE_F32_e64:
return AMDGPU::V_CMP_GE_F32_e64;
case AMDGPU::V_CMP_LE_F32_e64:
return AMDGPU::V_CMP_NLE_F32_e64;
case AMDGPU::V_CMP_NLE_F32_e64:
return AMDGPU::V_CMP_LE_F32_e64;
case AMDGPU::V_CMP_GT_F32_e64:
return AMDGPU::V_CMP_NGT_F32_e64;
case AMDGPU::V_CMP_NGT_F32_e64:
return AMDGPU::V_CMP_GT_F32_e64;
case AMDGPU::V_CMP_LT_F32_e64:
return AMDGPU::V_CMP_NLT_F32_e64;
case AMDGPU::V_CMP_NLT_F32_e64:
return AMDGPU::V_CMP_LT_F32_e64;
case AMDGPU::V_CMP_LG_F32_e64:
return AMDGPU::V_CMP_NLG_F32_e64;
case AMDGPU::V_CMP_NLG_F32_e64:
return AMDGPU::V_CMP_LG_F32_e64;
case AMDGPU::V_CMP_O_F32_e64:
return AMDGPU::V_CMP_U_F32_e64;
case AMDGPU::V_CMP_U_F32_e64:
return AMDGPU::V_CMP_O_F32_e64;
// float 64
case AMDGPU::V_CMP_EQ_F64_e64:
return AMDGPU::V_CMP_NEQ_F64_e64;
case AMDGPU::V_CMP_NEQ_F64_e64:
return AMDGPU::V_CMP_EQ_F64_e64;
case AMDGPU::V_CMP_GE_F64_e64:
return AMDGPU::V_CMP_NGE_F64_e64;
case AMDGPU::V_CMP_NGE_F64_e64:
return AMDGPU::V_CMP_GE_F64_e64;
case AMDGPU::V_CMP_LE_F64_e64:
return AMDGPU::V_CMP_NLE_F64_e64;
case AMDGPU::V_CMP_NLE_F64_e64:
return AMDGPU::V_CMP_LE_F64_e64;
case AMDGPU::V_CMP_GT_F64_e64:
return AMDGPU::V_CMP_NGT_F64_e64;
case AMDGPU::V_CMP_NGT_F64_e64:
return AMDGPU::V_CMP_GT_F64_e64;
case AMDGPU::V_CMP_LT_F64_e64:
return AMDGPU::V_CMP_NLT_F64_e64;
case AMDGPU::V_CMP_NLT_F64_e64:
return AMDGPU::V_CMP_LT_F64_e64;
case AMDGPU::V_CMP_LG_F64_e64:
return AMDGPU::V_CMP_NLG_F64_e64;
case AMDGPU::V_CMP_NLG_F64_e64:
return AMDGPU::V_CMP_LG_F64_e64;
case AMDGPU::V_CMP_O_F64_e64:
return AMDGPU::V_CMP_U_F64_e64;
case AMDGPU::V_CMP_U_F64_e64:
return AMDGPU::V_CMP_O_F64_e64;
default:
return 0;
}
Contributor: Could also handle cmp_class, but we probably should have handled this earlier.

}

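// Decide whether it is profitable to invert the compare that defines Reg and
// swap the source operands of all of its V_CNDMASK_B32 users. Collects the
// users in UsesToProcess and returns true only if every user is a
// V_CNDMASK_B32_e64 and more users would gain a VGPR in src1 (enabling the
// VOP2 form) than would lose one.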
bool SIShrinkInstructions::shouldSwapCndOperands(
Register Reg, std::vector<MachineInstr *> &UsesToProcess) const {
auto AllUses = MRI->use_nodbg_instructions(Reg);
int InstsToSwap = 0;

for (auto &UseInst : AllUses) {
Contributor: Should avoid looking at all uses; should perform the fold from the use cndmask.

Collaborator: The decision to swap operands is made either for all instructions or for none, since the original cmp instruction that defines vcc is also changed. You cannot really avoid looking at all uses.

Performing the fold from the use cndmask is inconvenient because you would have to track whether the specific vcc value was already analyzed.

if (UseInst.getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
return false;

UsesToProcess.push_back(&UseInst);

MachineOperand &Src0 = UseInst.getOperand(2);
MachineOperand &Src1 = UseInst.getOperand(4);

// If the instruction has source modifiers it cannot be converted to VOP2.
if (UseInst.getOperand(1).getImm() != SISrcMods::NONE ||
UseInst.getOperand(3).getImm() != SISrcMods::NONE)
continue;

bool Src0IsVGPR = Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg());
bool Src1IsVGPR = Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg());
Collaborator: nit: IsSrc0VGPR and IsSrc1VGPR seem better.

// Src1 always has to be a VGPR in VOP2.
if (!Src0IsVGPR && Src1IsVGPR)
InstsToSwap--;
else if (Src0IsVGPR && !Src1IsVGPR)
InstsToSwap++;
}
return InstsToSwap > 0;
}

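// Swap src0 and src1 (together with their source-modifier operands) of a
// V_CNDMASK_B32_e64 instruction.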
static void swapCndOperands(MachineInstr &MI) {
MachineOperand &Op2 = MI.getOperand(2);
MachineOperand Op4 = MI.getOperand(4);
Collaborator: Can we have an assertion to ensure MI contains at least 5 operands before getOperand(4)?

Collaborator: No, that would just clutter the code. getOperand already has the assertion in its implementation.

if (Op2.isReg()) {
MI.getOperand(4).ChangeToRegister(
Op2.getReg(), Op2.isDef(), Op2.isImplicit(), Op2.isKill(), Op2.isDead(),
Op2.isUndef(), Op2.isDebug());
MI.getOperand(4).setSubReg(Op2.getSubReg());
} else if (Op2.isImm()) {
MI.getOperand(4).ChangeToImmediate(Op2.getImm());
}

if (Op4.isReg()) {
Op2.ChangeToRegister(Op4.getReg(), Op4.isDef(), Op4.isImplicit(),
Op4.isKill(), Op4.isDead(), Op4.isUndef(),
Op4.isDebug());
Op2.setSubReg(Op4.getSubReg());
} else if (Op4.isImm()) {
Op2.ChangeToImmediate(Op4.getImm());
}

auto Op1Imm = MI.getOperand(1).getImm();
auto Op3Imm = MI.getOperand(3).getImm();
MI.getOperand(1).setImm(Op3Imm);
MI.getOperand(3).setImm(Op1Imm);
}

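// If profitable, replace the compare MI with its inverse, swap the source
// operands of all V_CNDMASK_B32 users, and return the new condition register;
// otherwise return MI's original destination register.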
Register SIShrinkInstructions::trySwapCndOperands(MachineInstr &MI) const {
Register Reg = MI.getOperand(0).getReg();

unsigned Opcode = getInverseCompareOpcode(MI);
std::vector<MachineInstr *> UsesToProcess;
if (!Opcode || !shouldSwapCndOperands(Reg, UsesToProcess))
return Reg;

auto DL = MI.getDebugLoc();
Register NewVCC = MRI->createVirtualRegister(MRI->getRegClass(Reg));

MachineInstrBuilder InverseCompare =
BuildMI(*MI.getParent(), MI, DL, TII->get(Opcode), NewVCC);
InverseCompare->setFlags(MI.getFlags());

unsigned OpNum = MI.getNumExplicitOperands();
for (unsigned Idx = 1; Idx < OpNum; Idx++) {
Contributor: The loop just makes this more complex; just do a complete BuildMI above (or just mutate the instruction in place?)

Author: The number of operands for cmp instructions is not fixed.

Author: I can do that, but then I have to check the number of operands explicitly.

Contributor: It is fixed; it just either has source modifiers or does not (I guess cmp_class complicates it slightly).
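A minimal sketch of the mutate-in-place variant (an assumption, not the patch as written: the original def register is kept, which would also make NewVCC, replaceRegWith, and eraseFromParent below unnecessary):

// Sketch only: invert the compare in place instead of building a replacement.
MI.setDesc(TII->get(Opcode));
for (MachineInstr *Use : UsesToProcess)
  swapCndOperands(*Use);
return MI.getOperand(0).getReg();

Because the instruction's operands and flags are left untouched, the operand-copy loop and the kill-flag handling below would also disappear in this variant.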

MachineOperand &Op = MI.getOperand(Idx);
InverseCompare.add(Op);
if (Op.isReg() && Op.isKill())
InverseCompare->getOperand(Idx).setIsKill(false);
Contributor (on lines +1033 to +1034): Can just do a clearKillFlags.
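A minimal sketch of that suggestion, using the existing MachineRegisterInfo::clearKillFlags helper (MRI is already a member of this pass):

// Sketch only: copy the operand unchanged and drop kill flags for the
// register in one call, rather than clearing the flag on the copied operand.
InverseCompare.add(Op);
if (Op.isReg())
  MRI->clearKillFlags(Op.getReg());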

}

for (MachineInstr *Use : UsesToProcess)
  swapCndOperands(*Use);

MRI->replaceRegWith(Reg, NewVCC);
Contributor: Should probably be making a local replacement.
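A hedged sketch of a more local replacement, relying on shouldSwapCndOperands having already rejected any non-cndmask user, so UsesToProcess covers every non-debug use of Reg:

// Sketch only: rewrite just the collected cndmask uses instead of calling
// replaceRegWith on every use of Reg.
for (MachineInstr *Use : UsesToProcess)
  for (MachineOperand &MO : Use->uses())
    if (MO.isReg() && MO.getReg() == Reg)
      MO.setReg(NewVCC);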

MI.eraseFromParent();
return NewVCC;
}

bool SIShrinkInstructions::run(MachineFunction &MF) {

this->MF = &MF;
@@ -840,6 +1053,11 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {

unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {

@@ -997,6 +1215,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
// dst.
Register DstReg = Op0.getReg();
if (DstReg.isVirtual()) {
DstReg = trySwapCndOperands(MI);
// VOPC instructions can only write to the VCC register. We can't
// force them to use VCC here, because this is only one register and
// cannot deal with sequences which would require multiple copies of
@@ -145,8 +145,8 @@ define amdgpu_ps float @select_vcc_s_s(float %cmp0, float %cmp1, float inreg %sr
; GFX10PLUS-LABEL: select_vcc_s_s:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: v_mov_b32_e32 v2, s3
; GFX10PLUS-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, s2, vcc_lo
; GFX10PLUS-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, s2, v2, vcc_lo
; GFX10PLUS-NEXT: ; return to shader part epilog
%cmp = fcmp oeq float %cmp0, %cmp1
%result = select i1 %cmp, float %src0, float %src1