Skip to content

Commit c00f816

Browse files
committed
16bit sgpr folding
1 parent e9fc768 commit c00f816

File tree

1 file changed

+90
-12
lines changed

1 file changed

+90
-12
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 90 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212
#include "AMDGPU.h"
1313
#include "GCNSubtarget.h"
1414
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15+
#include "SIInstrInfo.h"
1516
#include "SIMachineFunctionInfo.h"
17+
#include "SIRegisterInfo.h"
1618
#include "llvm/ADT/DepthFirstIterator.h"
19+
#include "llvm/CodeGen/MachineFunction.h"
1720
#include "llvm/CodeGen/MachineFunctionPass.h"
1821
#include "llvm/CodeGen/MachineOperand.h"
1922

@@ -576,6 +579,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
576579
}
577580

578581
MachineOperand *New = Fold.OpToFold;
582+
// TODO: Temporarily allow folding from SGPRs to 16-bit VGPRs.
583+
// Rework once the VS_16 register class is updated to include proper
584+
// 16-bit SGPRs instead of 32-bit ones.
585+
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
586+
Old.setSubReg(AMDGPU::NoSubRegister);
579587
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
580588
Old.setIsUndef(New->isUndef());
581589
return true;
@@ -947,9 +955,14 @@ void SIFoldOperandsImpl::foldOperand(
947955
return;
948956

949957
// FIXME: Fold operands with subregs.
950-
if (UseOp->isReg() && OpToFold.isReg() &&
951-
(UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
952-
return;
958+
if (UseOp->isReg() && OpToFold.isReg()) {
959+
if (UseOp->isImplicit())
960+
return;
961+
// Allow folding from SGPRs to 16-bit VGPRs.
962+
if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
963+
UseOp->getSubReg() != AMDGPU::lo16)
964+
return;
965+
}
953966

954967
// Special case for REG_SEQUENCE: We can't fold literals into
955968
// REG_SEQUENCE instructions, so we have to fold them into the
@@ -1030,14 +1043,20 @@ void SIFoldOperandsImpl::foldOperand(
10301043
return;
10311044

10321045
const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1033-
if (!DestReg.isPhysical()) {
1034-
if (DestRC == &AMDGPU::AGPR_32RegClass &&
1035-
TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1036-
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
1037-
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
1038-
CopiesToReplace.push_back(UseMI);
1039-
return;
1040-
}
1046+
if (DestRC == &AMDGPU::AGPR_32RegClass &&
1047+
TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1048+
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
1049+
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
1050+
CopiesToReplace.push_back(UseMI);
1051+
return;
1052+
}
1053+
1054+
// Allow immediates COPY'd into sgpr_lo16 to be further folded while
1055+
// still being legal if they are not folded further
1056+
if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
1057+
assert(ST->useRealTrue16Insts());
1058+
MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
1059+
DestRC = &AMDGPU::SGPR_32RegClass;
10411060
}
10421061

10431062
// In order to fold immediates into copies, we need to change the
@@ -1073,9 +1092,43 @@ void SIFoldOperandsImpl::foldOperand(
10731092
UseMI->getOperand(0).getReg().isVirtual() &&
10741093
!UseMI->getOperand(1).getSubReg()) {
10751094
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
1095+
unsigned Size = TII->getOpSize(*UseMI, 1);
10761096
Register UseReg = OpToFold.getReg();
10771097
UseMI->getOperand(1).setReg(UseReg);
1078-
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1098+
unsigned SubRegIdx = OpToFold.getSubReg();
1099+
// Hack to allow 32-bit SGPRs to be folded into True16 instructions
1100+
// Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1101+
// VS_16RegClass
1102+
//
1103+
// Excerpt from AMDGPUGenRegisterInfo.inc
1104+
// NoSubRegister, //0
1105+
// hi16, // 1
1106+
// lo16, // 2
1107+
// sub0, // 3
1108+
// ...
1109+
// sub1, // 11
1110+
// sub1_hi16, // 12
1111+
// sub1_lo16, // 13
1112+
static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1113+
if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1114+
TRI->isSGPRReg(*MRI, UseReg)) {
1115+
// Produce the 32-bit subregister index to which the 16-bit subregister
1116+
// is aligned.
1117+
if (SubRegIdx > AMDGPU::sub1) {
1118+
LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1119+
M |= M.getLane(M.getHighestLane() - 1);
1120+
SmallVector<unsigned, 4> Indexes;
1121+
TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1122+
Indexes);
1123+
assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1124+
SubRegIdx = Indexes[0];
1125+
// 32-bit registers do not have a sub0 index
1126+
} else if (TII->getOpSize(*UseMI, 1) == 4)
1127+
SubRegIdx = 0;
1128+
else
1129+
SubRegIdx = AMDGPU::sub0;
1130+
}
1131+
UseMI->getOperand(1).setSubReg(SubRegIdx);
10791132
UseMI->getOperand(1).setIsKill(false);
10801133
CopiesToReplace.push_back(UseMI);
10811134
OpToFold.setIsKill(false);
@@ -1713,6 +1766,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
17131766
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
17141767
return false;
17151768

1769+
// True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
1770+
// Can remove this code if proper 16-bit SGPRs are implemented
1771+
// Example: Pre-peephole-opt
1772+
// %29:sgpr_lo16 = COPY %16.lo16:sreg_32
1773+
// %32:sreg_32 = COPY %29:sgpr_lo16
1774+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1775+
// Post-peephole-opt and DCE
1776+
// %32:sreg_32 = COPY %16.lo16:sreg_32
1777+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1778+
// After this transform
1779+
// %32:sreg_32 = COPY %16:sreg_32
1780+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1781+
// After the fold operands pass
1782+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
1783+
if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
1784+
OpToFold.getSubReg()) {
1785+
const TargetRegisterClass *DstRC =
1786+
MRI->getRegClass(MI.getOperand(0).getReg());
1787+
if (DstRC == &AMDGPU::SReg_32RegClass &&
1788+
DstRC == MRI->getRegClass(OpToFold.getReg())) {
1789+
assert(OpToFold.getSubReg() == AMDGPU::lo16);
1790+
OpToFold.setSubReg(0);
1791+
}
1792+
}
1793+
17161794
// Prevent folding operands backwards in the function. For example,
17171795
// the COPY opcode must not be replaced by 1 in this example:
17181796
//

0 commit comments

Comments
 (0)