@@ -1123,24 +1123,85 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
 }

-// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation
-// where a consecutive multi-vector tuple is constructed from the same indices
-// of multiple strided loads. This may still result in unnecessary copies
-// between the loads and the tuple. Here we try to return a hint to assign the
-// contiguous ZPRMulReg starting at the same register as the first operand of
-// the pseudo, which should be a subregister of the first strided load.
+// We add regalloc hints for different cases:
+// * Choosing a better destination operand for predicated SVE instructions
+//   where the inactive lanes are undef, by choosing a register that is not
+//   unique to the other operands of the instruction.
 //
+// * Improve register allocation for SME multi-vector instructions where we
+//   can benefit from the strided and contiguous multi-vector register tuples.
 //
+// Here FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register
+// allocation where a consecutive multi-vector tuple is constructed from the
+// same indices of multiple strided loads. This may still result in
+// unnecessary copies between the loads and the tuple. We try to return a
+// hint to assign the contiguous ZPRMulReg starting at the same register as
+// the first operand of the pseudo, which should be a subregister of the
+// first strided load.
+//
+// For example, if the first strided load has been assigned $z16_z20_z24_z28
+// and the operands of the pseudo are each accessing subregister zsub2, we
+// should look through Order to find a contiguous register which begins
+// with $z24 (i.e. $z24_z25_z26_z27).
 bool AArch64RegisterInfo::getRegAllocationHints(
     Register VirtReg, ArrayRef<MCPhysReg> Order,
     SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
     const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
-
   auto &ST = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // For predicated SVE instructions where the inactive lanes are undef,
+  // pick a destination register that is not unique to avoid introducing
+  // a movprfx.
+  const TargetRegisterClass *RegRC = MRI.getRegClass(VirtReg);
+  if (AArch64::ZPRRegClass.hasSubClassEq(RegRC)) {
+    for (const MachineOperand &DefOp : MRI.def_operands(VirtReg)) {
+      const MachineInstr &Def = *DefOp.getParent();
+      if (DefOp.isImplicit() ||
+          (TII->get(Def.getOpcode()).TSFlags & AArch64::FalseLanesMask) !=
+              AArch64::FalseLanesUndef)
+        continue;
+
+      unsigned InstFlags =
+          TII->get(AArch64::getSVEPseudoMap(Def.getOpcode())).TSFlags;
+
+      for (MCPhysReg R : Order) {
+        auto AddHintIfSuitable = [&](MCPhysReg R, const MachineOperand &MO) {
+          // R is a suitable register hint if there exists an operand for the
+          // instruction that is not yet allocated a register or if R matches
+          // one of the other source operands.
+          if (!VRM->hasPhys(MO.getReg()) || VRM->getPhys(MO.getReg()) == R)
+            Hints.push_back(R);
+        };
+
+        switch (InstFlags & AArch64::DestructiveInstTypeMask) {
+        default:
+          break;
+        case AArch64::DestructiveTernaryCommWithRev:
+          AddHintIfSuitable(R, Def.getOperand(2));
+          AddHintIfSuitable(R, Def.getOperand(3));
+          AddHintIfSuitable(R, Def.getOperand(4));
+          break;
+        case AArch64::DestructiveBinaryComm:
+        case AArch64::DestructiveBinaryCommWithRev:
+          AddHintIfSuitable(R, Def.getOperand(2));
+          AddHintIfSuitable(R, Def.getOperand(3));
+          break;
+        case AArch64::DestructiveBinary:
+        case AArch64::DestructiveBinaryImm:
+          AddHintIfSuitable(R, Def.getOperand(2));
+          break;
+        }
+      }
+    }
+
+    if (Hints.size())
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+  }
+
   if (!ST.hasSME() || !ST.isStreaming())
     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                      VRM);
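
To make the hint rule in the block above concrete, here is a minimal standalone sketch of the same predicate outside of LLVM. ToyVRM, addHintIfSuitable, and the register numbers are made-up stand-ins for VirtRegMap and the patch's lambda, not LLVM API; the point is only the check itself: a candidate physical register R is hinted for the destination when some source operand is either not yet assigned a physical register or already lives in R, so the destructive encoding can reuse a source and no movprfx is needed.

  #include <cstdio>
  #include <optional>
  #include <vector>

  using PhysReg = unsigned; // e.g. 24 stands for SVE register z24
  using VirtReg = unsigned;

  // Toy stand-in for VirtRegMap: which virtual registers already have a
  // physical register assigned.
  struct ToyVRM {
    std::vector<std::optional<PhysReg>> Assignment;
    bool hasPhys(VirtReg V) const { return Assignment[V].has_value(); }
    PhysReg getPhys(VirtReg V) const { return *Assignment[V]; }
  };

  // Mirrors AddHintIfSuitable: hint R if the source operand is unallocated
  // or is already assigned R.
  static void addHintIfSuitable(PhysReg R, VirtReg Src, const ToyVRM &VRM,
                                std::vector<PhysReg> &Hints) {
    if (!VRM.hasPhys(Src) || VRM.getPhys(Src) == R)
      Hints.push_back(R);
  }

  int main() {
    // v0 (the destination) is unassigned; v1 is already in z17, v2 in z23.
    ToyVRM VRM{{std::nullopt, PhysReg{17}, PhysReg{23}}};
    const std::vector<PhysReg> Order = {16, 17, 23, 24}; // allocation order
    const std::vector<VirtReg> Srcs = {1, 2}; // a destructive binary op
    std::vector<PhysReg> Hints;
    for (PhysReg R : Order)
      for (VirtReg Src : Srcs)
        addHintIfSuitable(R, Src, VRM, Hints);
    for (PhysReg R : Hints)
      std::printf("hint z%u\n", R); // prints z17 and z23
    return 0;
  }

Running this prints z17 and z23, the registers already holding the sources, which is exactly the set the patch pushes to the front of Hints so the allocator tries them before a fresh register that would force a movprfx.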
@@ -1153,8 +1214,7 @@ bool AArch64RegisterInfo::getRegAllocationHints(
   // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy
   // instructions over reducing the number of clobbered callee-save registers,
   // so we add the strided registers as a hint.
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  unsigned RegID = MRI.getRegClass(VirtReg)->getID();
+  unsigned RegID = RegRC->getID();
   if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
       RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
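
As a concrete illustration of the tuple hint described in the comments above, a small self-contained sketch of the subregister arithmetic; contiguousTupleStart is a hypothetical helper for illustration, not a function in the patch. Assuming the first strided load was assigned $z16_z20_z24_z28 (start z16, stride 4) and the pseudo's operands each access zsub2, the contiguous hint should begin at z16 + 2 * 4 = z24, i.e. $z24_z25_z26_z27.

  #include <cstdio>

  // Start register of the contiguous tuple to hint, given the start and
  // stride of the strided tuple and the subregister index being accessed.
  static unsigned contiguousTupleStart(unsigned StridedStart, unsigned Stride,
                                       unsigned SubRegIdx) {
    return StridedStart + SubRegIdx * Stride;
  }

  int main() {
    // $z16_z20_z24_z28, operands access zsub2 -> hint begins at z24.
    std::printf("hint tuple starts at z%u\n",
                contiguousTupleStart(16, 4, 2));
    return 0;
  }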