Commit 1fa4482

AMDGPU: Factor agpr reg_sequence folding into a function
1 parent a316539

File tree: 1 file changed, +90 -78 lines


llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 90 additions & 78 deletions
@@ -128,6 +128,8 @@ class SIFoldOperandsImpl {
   bool tryFoldCndMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+
+  bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
   bool tryFoldFoldableCopy(MachineInstr &MI,
                            MachineOperand *&CurrentKnownM0Val) const;
 
@@ -1012,7 +1014,6 @@ void SIFoldOperandsImpl::foldOperand(
       UseMI->getOperand(0).getReg().isVirtual() &&
       !UseMI->getOperand(1).getSubReg()) {
     LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
-    unsigned Size = TII->getOpSize(*UseMI, 1);
     Register UseReg = OpToFold.getReg();
     UseMI->getOperand(1).setReg(UseReg);
     UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
@@ -1022,83 +1023,8 @@ void SIFoldOperandsImpl::foldOperand(
 
     // Remove kill flags as kills may now be out of order with uses.
     MRI->clearKillFlags(OpToFold.getReg());
-
-    // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
-    // can only accept VGPR or inline immediate. Recreate a reg_sequence with
-    // its initializers right here, so we will rematerialize immediates and
-    // avoid copies via different reg classes.
-    SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
-    if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
-        getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-      const DebugLoc &DL = UseMI->getDebugLoc();
-      MachineBasicBlock &MBB = *UseMI->getParent();
-
-      UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
-      for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
-        UseMI->removeOperand(I);
-
-      MachineInstrBuilder B(*MBB.getParent(), UseMI);
-      DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
-      SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
-      for (unsigned I = 0; I < Size / 4; ++I) {
-        MachineOperand *Def = Defs[I].first;
-        TargetInstrInfo::RegSubRegPair CopyToVGPR;
-        if (Def->isImm() &&
-            TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-          int64_t Imm = Def->getImm();
-
-          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-          BuildMI(MBB, UseMI, DL,
-                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
-          B.addReg(Tmp);
-        } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
-          auto Src = getRegSubRegPair(*Def);
-          Def->setIsKill(false);
-          if (!SeenAGPRs.insert(Src)) {
-            // We cannot build a reg_sequence out of the same registers, they
-            // must be copied. Better do it here before copyPhysReg() created
-            // several reads to do the AGPR->VGPR->AGPR copy.
-            CopyToVGPR = Src;
-          } else {
-            B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
-                     Src.SubReg);
-          }
-        } else {
-          assert(Def->isReg());
-          Def->setIsKill(false);
-          auto Src = getRegSubRegPair(*Def);
-
-          // Direct copy from SGPR to AGPR is not possible. To avoid creation
-          // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
-          // create a copy here and track if we already have such a copy.
-          if (TRI->isSGPRReg(*MRI, Src.Reg)) {
-            CopyToVGPR = Src;
-          } else {
-            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
-            B.addReg(Tmp);
-          }
-        }
-
-        if (CopyToVGPR.Reg) {
-          auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
-          Register &Vgpr = It->second;
-          if (Inserted) {
-            Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-            BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
-          }
-          auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-          BuildMI(MBB, UseMI, DL,
-                  TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
-          B.addReg(Tmp);
-        }
-
-        B.addImm(Defs[I].second);
-      }
-      LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
-    }
-
-    return;
+    if (foldCopyToAGPRRegSequence(UseMI))
+      return;
   }
 
   unsigned UseOpc = UseMI->getOpcode();
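
Note: the call site above now delegates to the new helper. To make the transformation concrete, here is a hypothetical MIR sketch of the inline-immediate case (register numbers and tuple size are illustrative, not taken from the commit or its tests):

; Before: each lane's immediate reaches the AGPR tuple via a VGPR move
; followed by a cross-class copy that copyPhysReg() would have to expand.
%0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
%1:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
%2:areg_64 = COPY %1

; After foldCopyToAGPRRegSequence: the COPY is rewritten in place into a
; REG_SEQUENCE and the inline immediate is rematerialized per 32-bit lane.
%3:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
%4:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
%2:areg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1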
@@ -1558,6 +1484,92 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
   return true;
 }
 
+/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
+/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
+bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
+  // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
+  // only accept VGPR or inline immediate. Recreate a reg_sequence with its
+  // initializers right here, so we will rematerialize immediates and avoid
+  // copies via different reg classes.
+  if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
+    return false;
+  unsigned Size = TII->getOpSize(*CopyMI, 1);
+  if (Size <= 4)
+    return false;
+
+  Register UseReg = CopyMI->getOperand(1).getReg();
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
+    return false;
+
+  const DebugLoc &DL = CopyMI->getDebugLoc();
+  MachineBasicBlock &MBB = *CopyMI->getParent();
+
+  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+  for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
+    CopyMI->removeOperand(I);
+
+  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
+  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+  for (unsigned I = 0; I < Size / 4; ++I) {
+    MachineOperand *Def = Defs[I].first;
+    TargetInstrInfo::RegSubRegPair CopyToVGPR;
+    if (Def->isImm() &&
+        TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+      int64_t Imm = Def->getImm();
+
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addImm(Imm);
+      B.addReg(Tmp);
+    } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+      auto Src = getRegSubRegPair(*Def);
+      Def->setIsKill(false);
+      if (!SeenAGPRs.insert(Src)) {
+        // We cannot build a reg_sequence out of the same registers, they
+        // must be copied. Better do it here before copyPhysReg() created
+        // several reads to do the AGPR->VGPR->AGPR copy.
+        CopyToVGPR = Src;
+      } else {
+        B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
+      }
+    } else {
+      assert(Def->isReg());
+      Def->setIsKill(false);
+      auto Src = getRegSubRegPair(*Def);
+
+      // Direct copy from SGPR to AGPR is not possible. To avoid creation
+      // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
+      // create a copy here and track if we already have such a copy.
+      if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+        CopyToVGPR = Src;
+      } else {
+        auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+        B.addReg(Tmp);
+      }
+    }
+
+    if (CopyToVGPR.Reg) {
+      auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
+      Register &Vgpr = It->second;
+      if (Inserted) {
+        Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+      }
+      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+          .addReg(Vgpr);
+      B.addReg(Tmp);
+    }
+
+    B.addImm(Defs[I].second);
+  }
+  LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldFoldableCopy(
     MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
   Register DstReg = MI.getOperand(0).getReg();
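
Note: for sources that cannot feed V_ACCVGPR_WRITE_B32_e64 directly, the helper routes the value through a single cached VGPR per distinct source (the VGPRCopies map); this covers SGPR inputs and AGPR inputs repeated within the tuple. A hypothetical MIR sketch of the SGPR case, register numbers illustrative and not taken from the commit:

; Before: an SGPR feeds both lanes; a direct SGPR->AGPR copy is not possible.
%1:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
%2:areg_64 = COPY %1

; After: one SGPR->VGPR COPY is materialized and reused for the second lane
; via the VGPRCopies cache, then written into each AGPR lane separately.
%3:vgpr_32 = COPY %0
%4:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %3, implicit $exec
%5:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %3, implicit $exec
%2:areg_64 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1

The Size <= 4 early-out in the helper matches this picture: a single 32-bit copy has no reg_sequence lanes to rebuild, so the fold only applies to multi-lane tuples.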
