Commit 4275fe9

[AMDGPU] Allow folding of non-subregs through REG_SEQUENCE
1 parent: f20619c

4 files changed (+442, -9 lines)

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 35 additions & 7 deletions
@@ -730,14 +730,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
     }
   }

-  // Rework once the VS_16 register class is updated to include proper
-  // 16-bit SGPRs instead of 32-bit ones.
-  if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
-    Old.setSubReg(AMDGPU::NoSubRegister);
+  Old.setSubReg(New->getSubReg());
   if (New->getReg().isPhysical()) {
     Old.substPhysReg(New->getReg(), *TRI);
   } else {
-    Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
+    Old.substVirtReg(New->getReg(), 0, *TRI);
     Old.setIsUndef(New->isUndef());
   }
   return true;
@@ -1150,10 +1147,14 @@ void SIFoldOperandsImpl::foldOperand(
   if (UseOp->isReg() && OpToFold.isReg()) {
     if (UseOp->isImplicit())
       return;
-    // Allow folding from SGPRs to 16-bit VGPRs.
+
+    MachineInstr *SourceInstruction = MRI->getVRegDef(UseOp->getReg());
+    // Allow folding from SGPRs to 16-bit VGPRs
+    // or folding of non-subregs through REG_SEQUENCES.
     if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
         (UseOp->getSubReg() != AMDGPU::lo16 ||
-         !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
+         !TRI->isSGPRReg(*MRI, OpToFold.getReg())) &&
+        !SourceInstruction->isRegSequence())
       return;
   }

@@ -1452,6 +1453,33 @@ void SIFoldOperandsImpl::foldOperand(
     return;
   }

+  if (!FoldingImmLike && OpToFold.isReg() && ST->needsAlignedVGPRs()) {
+    unsigned Opc = UseMI->getOpcode();
+    // Special case for DS_GWS instructions that only use 32 bits but hardware
+    // treats it as a 64 bit read.
+    if (Opc == AMDGPU::DS_GWS_INIT || Opc == AMDGPU::DS_GWS_SEMA_BR ||
+        Opc == AMDGPU::DS_GWS_BARRIER) {
+      const TargetRegisterClass *RC =
+          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
+      assert(RC);
+
+      const auto isAlignedReg = [&OpToFold, &UseOp, &UseMI, &RC,
+                                 this](AMDGPU::OpName OpName) -> bool {
+        const MachineOperand *Op = TII->getNamedOperand(*UseMI, OpName);
+        if (Op != UseOp)
+          return true;
+        Register Reg = OpToFold.getReg();
+        assert(!Reg.isPhysical());
+        return TRI->getRegSizeInBits(*RC) > 32 &&
+               !(TRI->getChannelFromSubReg(OpToFold.getSubReg()) & 1) &&
+               TRI->isProperlyAlignedRC(*RC);
+      };
+
+      if (!isAlignedReg(AMDGPU::OpName::data0))
+        return;
+    }
+  }
+
   // FIXME: We could try to change the instruction from 64-bit to 32-bit
   // to enable more folding opportunities. The shrink operands pass
   // already does this.
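
Editor's note (illustration only, not part of the commit): with the new SourceInstruction->isRegSequence() check, an operand that carries no subregister index can be folded through a REG_SEQUENCE into a user that reads a subregister of the sequence's result, and updateOperand now transfers the folded operand's subregister onto the use instead of special-casing lo16. A rough, hypothetical MIR sketch of the shape of fold this enables; the specific opcodes, register classes, and virtual registers are made up here, not taken from the commit's tests:

  %0:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
  %1:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %2, %subreg.sub1
  %3:vgpr_32 = V_ADD_U32_e32 %1.sub0, %4, implicit $exec
  ; after folding, the user reads the REG_SEQUENCE input directly:
  %3:vgpr_32 = V_ADD_U32_e32 %0, %4, implicit $exec

The DS_GWS block appears to guard this on subtargets that require aligned VGPRs: the data0 operand of DS_GWS_INIT, DS_GWS_SEMA_BR, and DS_GWS_BARRIER is treated by hardware as a 64-bit read even though only 32 bits are used, so the fold is only taken when the folded register lives in a properly aligned class starting at an even channel.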
