@@ -240,7 +240,8 @@ class SIFoldOperandsImpl {
                       SmallVectorImpl<FoldCandidate> &FoldList) const;
   void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
-                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
+                   SmallVectorImpl<MachineInstr *> &CopiesToReplace,
+                   bool RegSeqSourceWasSubreg) const;
 
   std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
   bool tryConstantFoldOp(MachineInstr *MI) const;
@@ -712,8 +713,12 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
           TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
     const TargetRegisterClass *NewRC =
         TRI->getRegClassForReg(*MRI, New->getReg());
-    const TargetRegisterClass *ConstrainRC =
-        TRI->findCommonRegClass(OpRC, Old.getSubReg(), NewRC, New->getSubReg());
+
+    const TargetRegisterClass *ConstrainRC = OpRC;
+    if (New->getSubReg())
+      ConstrainRC =
+          TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
+
     if (!ConstrainRC)
       return false;
 
@@ -726,7 +731,9 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
 
   // Rework once the VS_16 register class is updated to include proper
   // 16-bit SGPRs instead of 32-bit ones.
-  if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
+  if ((Old.getSubReg() == AMDGPU::lo16 &&
+       TRI->isSGPRReg(*MRI, New->getReg())) ||
+      !New->getSubReg())
     Old.setSubReg(AMDGPU::NoSubRegister);
   if (New->getReg().isPhysical()) {
     Old.substPhysReg(New->getReg(), *TRI);
@@ -1161,7 +1168,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
 void SIFoldOperandsImpl::foldOperand(
     FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
     SmallVectorImpl<FoldCandidate> &FoldList,
-    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
+    SmallVectorImpl<MachineInstr *> &CopiesToReplace,
+    bool RegSeqSourceWasSubreg = true) const {
   const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
 
   if (!isUseSafeToFold(*UseMI, *UseOp))
@@ -1171,10 +1179,12 @@ void SIFoldOperandsImpl::foldOperand(
   if (UseOp->isReg() && OpToFold.isReg()) {
     if (UseOp->isImplicit())
       return;
-    // Allow folding from SGPRs to 16-bit VGPRs.
+    // Allow folding from SGPRs to 16-bit VGPRs
+    // or folding of non-subregs through REG_SEQUENCES.
     if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
        (UseOp->getSubReg() != AMDGPU::lo16 ||
-        !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
+        !TRI->isSGPRReg(*MRI, OpToFold.getReg())) &&
+        RegSeqSourceWasSubreg)
       return;
   }
 
@@ -1215,10 +1225,12 @@ void SIFoldOperandsImpl::foldOperand(
       if (RSUse->getSubReg() != RegSeqDstSubReg)
         continue;
 
+      RegSeqSourceWasSubreg = (UseOp->getSubReg() != AMDGPU::NoSubRegister) &&
+                              RegSeqSourceWasSubreg;
       // FIXME: We should avoid recursing here. There should be a cleaner split
       // between the in-place mutations and adding to the fold list.
       foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
-                  CopiesToReplace);
+                  CopiesToReplace, RegSeqSourceWasSubreg);
     }
 
     return;
@@ -1465,6 +1477,33 @@ void SIFoldOperandsImpl::foldOperand(
     return;
   }
 
+  if (!FoldingImmLike && OpToFold.isReg() && ST->needsAlignedVGPRs()) {
+    unsigned Opc = UseMI->getOpcode();
+    // Special case for DS_GWS instructions that only use 32 bits but hardware
+    // treats it as a 64 bit read.
+    if (Opc == AMDGPU::DS_GWS_INIT || Opc == AMDGPU::DS_GWS_SEMA_BR ||
+        Opc == AMDGPU::DS_GWS_BARRIER) {
+      const TargetRegisterClass *RC =
+          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
+      assert(RC);
+
+      const auto isAlignedReg = [&OpToFold, &UseOp, &UseMI, &RC,
+                                 this](AMDGPU::OpName OpName) -> bool {
+        const MachineOperand *Op = TII->getNamedOperand(*UseMI, OpName);
+        if (Op != UseOp)
+          return true;
+        Register Reg = OpToFold.getReg();
+        assert(!Reg.isPhysical());
+        return TRI->getRegSizeInBits(*RC) > 32 &&
+               !(TRI->getChannelFromSubReg(OpToFold.getSubReg()) & 1) &&
+               TRI->isProperlyAlignedRC(*RC);
+      };
+
+      if (!isAlignedReg(AMDGPU::OpName::data0))
+        return;
+    }
+  }
+
   // FIXME: We could try to change the instruction from 64-bit to 32-bit
   // to enable more folding opportunities. The shrink operands pass
   // already does this.