Skip to content

Commit 739fc21

Browse files
committed
16bit sgpr folding
1 parent 74306af commit 739fc21

File tree

1 file changed

+89
-12
lines changed

1 file changed

+89
-12
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 89 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212
#include "AMDGPU.h"
1313
#include "GCNSubtarget.h"
1414
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15+
#include "SIInstrInfo.h"
1516
#include "SIMachineFunctionInfo.h"
17+
#include "SIRegisterInfo.h"
1618
#include "llvm/ADT/DepthFirstIterator.h"
19+
#include "llvm/CodeGen/MachineFunction.h"
1720
#include "llvm/CodeGen/MachineFunctionPass.h"
1821
#include "llvm/CodeGen/MachineOperand.h"
1922

@@ -570,6 +573,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
570573
}
571574

572575
MachineOperand *New = Fold.OpToFold;
576+
// TODO: Temporarily allow folding from SGPRs to 16-bit VGPRs.
577+
// Rework once the VS_16 register class is updated to include proper
578+
// 16-bit SGPRs instead of 32-bit ones.
579+
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
580+
Old.setSubReg(AMDGPU::NoSubRegister);
573581
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
574582
Old.setIsUndef(New->isUndef());
575583
return true;
@@ -875,9 +883,14 @@ void SIFoldOperandsImpl::foldOperand(
875883
return;
876884

877885
// FIXME: Fold operands with subregs.
878-
if (UseOp->isReg() && OpToFold.isReg() &&
879-
(UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
880-
return;
886+
if (UseOp->isReg() && OpToFold.isReg()) {
887+
if (UseOp->isImplicit())
888+
return;
889+
// Allow folding from SGPRs to 16-bit VGPRs.
890+
if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
891+
UseOp->getSubReg() != AMDGPU::lo16)
892+
return;
893+
}
881894

882895
// Special case for REG_SEQUENCE: We can't fold literals into
883896
// REG_SEQUENCE instructions, so we have to fold them into the
@@ -958,14 +971,20 @@ void SIFoldOperandsImpl::foldOperand(
958971
return;
959972

960973
const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
961-
if (!DestReg.isPhysical()) {
962-
if (DestRC == &AMDGPU::AGPR_32RegClass &&
963-
TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
964-
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
965-
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
966-
CopiesToReplace.push_back(UseMI);
967-
return;
968-
}
974+
if (DestRC == &AMDGPU::AGPR_32RegClass &&
975+
TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
976+
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
977+
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
978+
CopiesToReplace.push_back(UseMI);
979+
return;
980+
}
981+
982+
// Allow an immediate that was COPY'd into an sgpr_lo16 to be folded
983+
// further; the COPY itself remains legal if it is never folded.
984+
if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
985+
assert(ST->useRealTrue16Insts());
986+
MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
987+
DestRC = &AMDGPU::SGPR_32RegClass;
969988
}
970989

971990
// In order to fold immediates into copies, we need to change the
@@ -1004,7 +1023,40 @@ void SIFoldOperandsImpl::foldOperand(
10041023
unsigned Size = TII->getOpSize(*UseMI, 1);
10051024
Register UseReg = OpToFold.getReg();
10061025
UseMI->getOperand(1).setReg(UseReg);
1007-
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1026+
unsigned SubRegIdx = OpToFold.getSubReg();
1027+
// Hack to allow 32-bit SGPRs to be folded into True16 instructions
1028+
// Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1029+
// VS_16RegClass
1030+
//
1031+
// Excerpt from AMDGPUGenRegisterInfo.inc
1032+
// NoSubRegister, //0
1033+
// hi16, // 1
1034+
// lo16, // 2
1035+
// sub0, // 3
1036+
// ...
1037+
// sub1, // 11
1038+
// sub1_hi16, // 12
1039+
// sub1_lo16, // 13
1040+
static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1041+
if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1042+
TRI->isSGPRReg(*MRI, UseReg)) {
1043+
// Produce the 32-bit subregister index to which the 16-bit subregister
1044+
// is aligned.
1045+
if (SubRegIdx > AMDGPU::sub1) {
1046+
LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1047+
M |= M.getLane(M.getHighestLane() - 1);
1048+
SmallVector<unsigned, 4> Indexes;
1049+
TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1050+
Indexes);
1051+
assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1052+
SubRegIdx = Indexes[0];
1053+
// 32-bit registers do not have a sub0 index
1054+
} else if (TII->getOpSize(*UseMI, 1) == 4)
1055+
SubRegIdx = 0;
1056+
else
1057+
SubRegIdx = AMDGPU::sub0;
1058+
}
1059+
UseMI->getOperand(1).setSubReg(SubRegIdx);
10081060
UseMI->getOperand(1).setIsKill(false);
10091061
CopiesToReplace.push_back(UseMI);
10101062
OpToFold.setIsKill(false);
@@ -1584,6 +1636,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
15841636
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
15851637
return false;
15861638

1639+
// True16: Fix a malformed 16-bit SGPR COPY produced by peephole-opt.
1640+
// This code can be removed once proper 16-bit SGPRs are implemented.
1641+
// Example: Pre-peephole-opt
1642+
// %29:sgpr_lo16 = COPY %16.lo16:sreg_32
1643+
// %32:sreg_32 = COPY %29:sgpr_lo16
1644+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1645+
// Post-peephole-opt and DCE
1646+
// %32:sreg_32 = COPY %16.lo16:sreg_32
1647+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1648+
// After this transform
1649+
// %32:sreg_32 = COPY %16:sreg_32
1650+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1651+
// After the fold operands pass
1652+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
1653+
if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
1654+
OpToFold.getSubReg()) {
1655+
const TargetRegisterClass *DstRC =
1656+
MRI->getRegClass(MI.getOperand(0).getReg());
1657+
if (DstRC == &AMDGPU::SReg_32RegClass &&
1658+
DstRC == MRI->getRegClass(OpToFold.getReg())) {
1659+
assert(OpToFold.getSubReg() == AMDGPU::lo16);
1660+
OpToFold.setSubReg(0);
1661+
}
1662+
}
1663+
15871664
// Prevent folding operands backwards in the function. For example,
15881665
// the COPY opcode must not be replaced by 1 in this example:
15891666
//

0 commit comments

Comments
 (0)