Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
390fd92
[AMDGPU] SIPeepholeSDWA: Reject V_CNDMASK_B32_e64 instead of V_CNDMAS…
frederik-h Mar 27, 2025
c9b7002
[AMDGPU] SIPeepholeSDWA: Handle V_CNDMASK_B32_e64
frederik-h Apr 4, 2025
16e4118
Change computeRegisterLiveness use
frederik-h Apr 30, 2025
c344d14
Stop moving carry-in def instruction
frederik-h Apr 30, 2025
c100594
Handle undef carry-in operand
frederik-h Apr 30, 2025
b2a5bab
Remove extra newline from debug output
frederik-h Apr 30, 2025
65d7dd1
Rename test files to indicate the different ISAs being tested
frederik-h Apr 30, 2025
b0e665e
Use COPY instead of V_CMP_EQ for copy to VCC
frederik-h Apr 30, 2025
fc50f87
Handle wave32
frederik-h Apr 30, 2025
f05ec81
Rename sdwa-peephole-cndmask-gfx{9,10} tests
frederik-h Apr 30, 2025
3b2dc23
Unify test names
frederik-h Apr 30, 2025
f807526
Update llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
frederik-h May 2, 2025
9bea2ed
clang-format changes
frederik-h May 2, 2025
af365ee
Rename convertToImplicitVcc and move CarryDef up
frederik-h May 2, 2025
3c8bc54
Extend tests
frederik-h May 2, 2025
952881f
clang-format changes
frederik-h May 2, 2025
5c4cae5
Change test prefix
frederik-h May 2, 2025
9e406a9
Apply suggestions from code review
frederik-h May 2, 2025
a8f5dc8
Adjusts tests
frederik-h May 2, 2025
e943523
Make sure that V_CND_MASK gets handled
frederik-h May 2, 2025
d027b65
Change tests to avoid the impression that the carry-in def will be re…
frederik-h May 2, 2025
135d3a0
Always copy from carry-in operand to VCC
frederik-h May 5, 2025
8d7825a
Apply suggestions from code review
frederik-h May 5, 2025
5a98da9
Apply suggestions from code review
frederik-h May 5, 2025
b79a9ce
Adjust tests
frederik-h May 5, 2025
1975582
Rename VOP2 test file and remove "-vop3" from other test names
frederik-h May 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 90 additions & 7 deletions llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class SIPeepholeSDWA {
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
void pseudoOpConvertToVOP2(MachineInstr &MI,
const GCNSubtarget &ST) const;
/// Try to convert the V_CNDMASK_B32_e64 \p MI into its VOP2 form, which
/// takes the src2 carry-in through VCC. \p MI is left untouched when the
/// conversion cannot be applied.
void convertToImplicitVcc(MachineInstr &MI, const GCNSubtarget &ST) const;
MachineInstr *createSDWAVersion(MachineInstr &MI);
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
Expand Down Expand Up @@ -1061,6 +1062,79 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

/// Try to convert \p MI, a V_CNDMASK_B32_e64 which takes its carry-in
/// (lane mask) as the explicit src2 operand, into the corresponding
/// VOP2 form which expects that argument in VCC. To this end, a COPY
/// from the carry-in operand to VCC is inserted directly before \p MI.
///
/// The conversion is only applied if
///  - the function is not compiled for wave32 (VCC would be VCC_LO),
///  - \p MI can be shrunk to VOP2,
///  - the carry-in is a virtual register with a known definition, and
///  - VCC can be proven to be dead before \p MI.
/// Otherwise \p MI is left unchanged.
///
/// \param MI The V_CNDMASK_B32_e64 instruction to convert. It is erased
///           from its parent block on success.
/// \param ST The current subtarget (currently unused).
void SIPeepholeSDWA::convertToImplicitVcc(MachineInstr &MI,
                                          const GCNSubtarget &ST) const {
  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);

  MCRegister Vcc = TRI->getVCC();
  // FIXME Conversion introduces implicit vcc_hi use
  if (Vcc == AMDGPU::VCC_LO)
    return;

  LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
  if (!TII->canShrink(MI, *MRI)) {
    LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
    return;
  }

  const MachineOperand &CarryIn =
      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);

  // The carry-in must be a virtual register that actually has a
  // definition; getVRegDef returns null for an undefined register and
  // must not be queried with a physical register.
  if (!CarryIn.isReg() || !CarryIn.getReg().isVirtual() ||
      !MRI->getVRegDef(CarryIn.getReg())) {
    LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
    return;
  }

  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, Vcc, MI, 100);
  if (Liveness != MachineBasicBlock::LQR_Dead) {
    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction.\n");
    return;
  }

  // Transfer the lane mask to VCC with a plain COPY. A V_CMP_EQ against
  // the constant 1 would compare the whole mask value in every lane
  // instead of copying the per-lane bits, and retargeting/moving the
  // instruction that defines the carry-in is not safe in general (it may
  // live in another block or be separated from MI by an EXEC change).
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);

  auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
                           TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
                       .setMIFlags(MI.getFlags());
  // Printing a MachineInstr already terminates the line.
  LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
  // Converted is only referenced by the debug output above, which is
  // compiled out in release builds.
  (void)Converted;
  MI.eraseFromParent();
}

namespace {
bool isConvertibleToSDWA(MachineInstr &MI,
const GCNSubtarget &ST,
Expand All @@ -1070,6 +1144,11 @@ bool isConvertibleToSDWA(MachineInstr &MI,
if (TII->isSDWA(Opc))
return true;

// Can only be handled after earlier conversion to
// AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
if (Opc == AMDGPU::V_CNDMASK_B32_e64)
return false;

// Check if this instruction has opcode that supports SDWA
if (AMDGPU::getSDWAOp(Opc) == -1)
Opc = AMDGPU::getVOPe32(Opc);
Expand Down Expand Up @@ -1108,10 +1187,6 @@ bool isConvertibleToSDWA(MachineInstr &MI,
if (TII->pseudoToMCOpcode(Opc) == -1)
return false;

// FIXME: has SDWA but require handling of implicit VCC use
if (Opc == AMDGPU::V_CNDMASK_B32_e32)
return false;

if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
if (!Src0->isReg() && !Src0->isImm())
return false;
Expand Down Expand Up @@ -1384,10 +1459,18 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
if (PotentialMI &&
(PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
if (!PotentialMI)
continue;

switch (PotentialMI->getOpcode()) {
case AMDGPU::V_ADD_CO_U32_e64:
case AMDGPU::V_SUB_CO_U32_e64:
pseudoOpConvertToVOP2(*PotentialMI, ST);
break;
case AMDGPU::V_CNDMASK_B32_e64:
convertToImplicitVcc(*PotentialMI, ST);
break;
};
}
SDWAOperands.clear();

Expand Down
Loading