@@ -730,14 +730,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
     }
   }
 
-  // Rework once the VS_16 register class is updated to include proper
-  // 16-bit SGPRs instead of 32-bit ones.
-  if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
-    Old.setSubReg(AMDGPU::NoSubRegister);
+  Old.setSubReg(New->getSubReg());
   if (New->getReg().isPhysical()) {
     Old.substPhysReg(New->getReg(), *TRI);
   } else {
-    Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
+    Old.substVirtReg(New->getReg(), 0, *TRI);
     Old.setIsUndef(New->isUndef());
   }
   return true;
@@ -1150,10 +1147,14 @@ void SIFoldOperandsImpl::foldOperand(
   if (UseOp->isReg() && OpToFold.isReg()) {
     if (UseOp->isImplicit())
       return;
-    // Allow folding from SGPRs to 16-bit VGPRs.
+
+    MachineInstr *SourceInstruction = MRI->getVRegDef(UseOp->getReg());
+    // Allow folding from SGPRs to 16-bit VGPRs
+    // or folding of non-subregs through REG_SEQUENCES.
     if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
         (UseOp->getSubReg() != AMDGPU::lo16 ||
-         !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
+         !TRI->isSGPRReg(*MRI, OpToFold.getReg())) &&
+        !SourceInstruction->isRegSequence())
       return;
   }
 
@@ -1452,6 +1453,33 @@ void SIFoldOperandsImpl::foldOperand(
     return;
   }
 
+  if (!FoldingImmLike && OpToFold.isReg() && ST->needsAlignedVGPRs()) {
+    unsigned Opc = UseMI->getOpcode();
+    // Special case for DS_GWS instructions that only use 32 bits but hardware
+    // treats it as a 64 bit read.
+    if (Opc == AMDGPU::DS_GWS_INIT || Opc == AMDGPU::DS_GWS_SEMA_BR ||
+        Opc == AMDGPU::DS_GWS_BARRIER) {
+      const TargetRegisterClass *RC =
+          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
+      assert(RC);
+
+      const auto isAlignedReg = [&OpToFold, &UseOp, &UseMI, &RC,
+                                 this](AMDGPU::OpName OpName) -> bool {
+        const MachineOperand *Op = TII->getNamedOperand(*UseMI, OpName);
+        if (Op != UseOp)
+          return true;
+        Register Reg = OpToFold.getReg();
+        assert(!Reg.isPhysical());
+        return TRI->getRegSizeInBits(*RC) > 32 &&
+               !(TRI->getChannelFromSubReg(OpToFold.getSubReg()) & 1) &&
+               TRI->isProperlyAlignedRC(*RC);
+      };
+
+      if (!isAlignedReg(AMDGPU::OpName::data0))
+        return;
+    }
+  }
+
   // FIXME: We could try to change the instruction from 64-bit to 32-bit
   // to enable more folding opportunities. The shrink operands pass
   // already does this.