Commit 06ddf3c

[AMDGPU] Allow folding of non-subregs through REG_SEQUENCE
1 parent e229857 commit 06ddf3c

4 files changed, +453 -10 lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 47 additions & 8 deletions
@@ -240,7 +240,8 @@ class SIFoldOperandsImpl {
                        SmallVectorImpl<FoldCandidate> &FoldList) const;
   void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
-                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
+                   SmallVectorImpl<MachineInstr *> &CopiesToReplace,
+                   bool RegSeqSourceWasSubreg) const;
 
   std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
   bool tryConstantFoldOp(MachineInstr *MI) const;
@@ -712,8 +713,12 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
           TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
     const TargetRegisterClass *NewRC =
         TRI->getRegClassForReg(*MRI, New->getReg());
-    const TargetRegisterClass *ConstrainRC =
-        TRI->findCommonRegClass(OpRC, Old.getSubReg(), NewRC, New->getSubReg());
+
+    const TargetRegisterClass *ConstrainRC = OpRC;
+    if (New->getSubReg())
+      ConstrainRC =
+          TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
+
     if (!ConstrainRC)
       return false;
 
@@ -726,7 +731,9 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
 
   // Rework once the VS_16 register class is updated to include proper
   // 16-bit SGPRs instead of 32-bit ones.
-  if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
+  if ((Old.getSubReg() == AMDGPU::lo16 &&
+       TRI->isSGPRReg(*MRI, New->getReg())) ||
+      !New->getSubReg())
     Old.setSubReg(AMDGPU::NoSubRegister);
   if (New->getReg().isPhysical()) {
     Old.substPhysReg(New->getReg(), *TRI);
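
The two updateOperand hunks above change how a folded register is constrained: when the incoming value carries no subregister index, the operand simply keeps its original register class (and the use's own subregister is cleared), while a subregister fold still has to find a matching super-register class via getMatchingSuperRegClass or be rejected. Below is a minimal standalone sketch of that decision, using hypothetical stand-in types rather than the real TargetRegisterInfo API.

#include <iostream>
#include <string>

// Hypothetical stand-in for llvm::TargetRegisterClass; the real pass queries
// TargetRegisterInfo instead of this toy table.
struct RC {
  std::string Name;
};

// Toy substitute for TRI->getMatchingSuperRegClass(NewRC, OpRC, SubReg): in
// this sketch only a subregister fold into a 64-bit class succeeds.
const RC *matchingSuperRC(const RC &NewRC, const RC &OpRC, unsigned SubReg) {
  static const RC VReg64{"VReg_64"};
  (void)NewRC;
  return (SubReg != 0 && OpRC.Name == "VReg_64") ? &VReg64 : nullptr;
}

// Mirrors the constraint selection after the patch: no subregister on the
// folded value means the operand's own class is already good enough; a
// subregister fold must find a matching super-register class or is rejected.
const RC *pickConstraint(const RC &OpRC, const RC &NewRC, unsigned SubReg) {
  if (SubReg == 0)
    return &OpRC;
  return matchingSuperRC(NewRC, OpRC, SubReg);
}

int main() {
  const RC VGPR32{"VGPR_32"}, VReg64{"VReg_64"};
  // Non-subreg fold: kept, constrained to the operand's original class.
  std::cout << (pickConstraint(VGPR32, VGPR32, 0) ? "fold ok" : "rejected") << '\n';
  // Subreg fold with no matching super-class: rejected.
  std::cout << (pickConstraint(VGPR32, VReg64, 1) ? "fold ok" : "rejected") << '\n';
}
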
@@ -1161,7 +1168,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
 void SIFoldOperandsImpl::foldOperand(
     FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
     SmallVectorImpl<FoldCandidate> &FoldList,
-    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
+    SmallVectorImpl<MachineInstr *> &CopiesToReplace,
+    bool RegSeqSourceWasSubreg = true) const {
   const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
 
   if (!isUseSafeToFold(*UseMI, *UseOp))
@@ -1171,10 +1179,12 @@ void SIFoldOperandsImpl::foldOperand(
   if (UseOp->isReg() && OpToFold.isReg()) {
     if (UseOp->isImplicit())
       return;
-    // Allow folding from SGPRs to 16-bit VGPRs.
+    // Allow folding from SGPRs to 16-bit VGPRs
+    // or folding of non-subregs through REG_SEQUENCES.
     if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
         (UseOp->getSubReg() != AMDGPU::lo16 ||
-         !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
+         !TRI->isSGPRReg(*MRI, OpToFold.getReg())) &&
+        RegSeqSourceWasSubreg)
       return;
   }
 
@@ -1215,10 +1225,12 @@ void SIFoldOperandsImpl::foldOperand(
       if (RSUse->getSubReg() != RegSeqDstSubReg)
         continue;
 
+      RegSeqSourceWasSubreg = (UseOp->getSubReg() != AMDGPU::NoSubRegister) &&
+                              RegSeqSourceWasSubreg;
       // FIXME: We should avoid recursing here. There should be a cleaner split
       // between the in-place mutations and adding to the fold list.
       foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
-                  CopiesToReplace);
+                  CopiesToReplace, RegSeqSourceWasSubreg);
     }
 
     return;
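
The recursion above threads RegSeqSourceWasSubreg through nested REG_SEQUENCE uses: the flag is AND-accumulated, so it stays true only while every source operand seen so far carried a subregister index, and it is the flag going false (a full, non-subreg source somewhere in the chain) that relaxes the subregister guard earlier in foldOperand. A standalone sketch of that accumulation over a simplified chain follows; the names are assumptions for illustration, not the pass's data structures.

#include <iostream>
#include <vector>

// Simplified model of the chain of REG_SEQUENCE source operands the fold
// walks through; each entry records whether that source had a subregister.
struct RegSeqSource {
  bool HasSubReg;
};

// Mirrors the AND-accumulation in foldOperand(): the flag starts out true
// (the default argument) and only stays true if every source in the chain
// carried a subregister index.
bool regSeqSourceWasSubreg(const std::vector<RegSeqSource> &Chain) {
  bool WasSubreg = true;
  for (const RegSeqSource &Src : Chain)
    WasSubreg = Src.HasSubReg && WasSubreg;
  return WasSubreg;
}

int main() {
  // Every source carries a subregister: flag stays true, the old guard applies.
  std::cout << regSeqSourceWasSubreg({{true}, {true}}) << '\n';  // prints 1
  // A non-subreg source anywhere clears the flag and allows the fold through.
  std::cout << regSeqSourceWasSubreg({{true}, {false}}) << '\n'; // prints 0
}
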
@@ -1465,6 +1477,33 @@ void SIFoldOperandsImpl::foldOperand(
     return;
   }
 
+  if (!FoldingImmLike && OpToFold.isReg() && ST->needsAlignedVGPRs()) {
+    unsigned Opc = UseMI->getOpcode();
+    // Special case for DS_GWS instructions that only use 32 bits but hardware
+    // treats it as a 64 bit read.
+    if (Opc == AMDGPU::DS_GWS_INIT || Opc == AMDGPU::DS_GWS_SEMA_BR ||
+        Opc == AMDGPU::DS_GWS_BARRIER) {
+      const TargetRegisterClass *RC =
+          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
+      assert(RC);
+
+      const auto isAlignedReg = [&OpToFold, &UseOp, &UseMI, &RC,
+                                 this](AMDGPU::OpName OpName) -> bool {
+        const MachineOperand *Op = TII->getNamedOperand(*UseMI, OpName);
+        if (Op != UseOp)
+          return true;
+        Register Reg = OpToFold.getReg();
+        assert(!Reg.isPhysical());
+        return TRI->getRegSizeInBits(*RC) > 32 &&
+               !(TRI->getChannelFromSubReg(OpToFold.getSubReg()) & 1) &&
+               TRI->isProperlyAlignedRC(*RC);
+      };
+
+      if (!isAlignedReg(AMDGPU::OpName::data0))
+        return;
+    }
+  }
+
   // FIXME: We could try to change the instruction from 64-bit to 32-bit
   // to enable more folding opportunities. The shrink operands pass
   // already does this.
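
The DS_GWS special case above rejects a fold whose data0 source would not start on an even VGPR lane, because on subtargets that need aligned VGPRs the hardware performs an aligned 64-bit read even though only 32 bits are consumed. The core of the check is just the low bit of the channel index derived from the folded operand's subregister; a minimal sketch of that parity test follows (illustrative helper name, not the SIRegisterInfo interface).

#include <iostream>

// The pass maps the folded operand's subregister to a 32-bit channel index
// (sub0 -> 0, sub1 -> 1, ...). For the 64-bit GWS access the register pair
// must start on an even channel, which is the `!(channel & 1)` term above.
bool startsOnEvenChannel(unsigned Channel) { return (Channel & 1) == 0; }

int main() {
  for (unsigned Ch = 0; Ch < 4; ++Ch)
    std::cout << "channel " << Ch << ": "
              << (startsOnEvenChannel(Ch) ? "acceptable for data0"
                                          : "rejected (odd start)")
              << '\n';
}
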

0 commit comments