@@ -1123,11 +1123,18 @@ bool AArch64RegisterInfo::getRegAllocationHints(
11231123 Use.getOpcode () != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO)
11241124 continue ;
11251125
1126- unsigned LdOps = Use.getNumOperands () - 1 ;
1127- const TargetRegisterClass *StridedRC =
1128- RegID == AArch64::ZPR2StridedOrContiguousRegClassID
1129- ? &AArch64::ZPR2StridedRegClass
1130- : &AArch64::ZPR4StridedRegClass;
1126+ unsigned UseOps = Use.getNumOperands () - 1 ;
1127+ const TargetRegisterClass *StridedRC;
1128+ switch (RegID) {
1129+ case AArch64::ZPR2StridedOrContiguousRegClassID:
1130+ StridedRC = &AArch64::ZPR2StridedRegClass;
1131+ break ;
1132+ case AArch64::ZPR4StridedOrContiguousRegClassID:
1133+ StridedRC = &AArch64::ZPR4StridedRegClass;
1134+ break ;
1135+ default :
1136+ llvm_unreachable (" Unexpected RegID" );
1137+ }
11311138
11321139 SmallVector<MCPhysReg, 4 > StridedOrder;
11331140 for (MCPhysReg Reg : Order)
@@ -1147,32 +1154,67 @@ bool AArch64RegisterInfo::getRegAllocationHints(
11471154 return VRM->hasPhys (Op.getReg ());
11481155 });
11491156
1157+ // Example:
1158+ //
1159+ // When trying to find a suitable register allocation for VirtReg %v2 in:
1160+ //
1161+ // %v0:zpr2stridedorcontiguous = ld1 p0/z, [...]
1162+ // %v1:zpr2stridedorcontiguous = ld1 p0/z, [...]
1163+ // %v2:zpr2stridedorcontiguous = ld1 p0/z, [...]
1164+ // %v3:zpr2stridedorcontiguous = ld1 p0/z, [...]
1165+ // %v4:zpr4mul4 = FORM_TRANSPOSED_X4 %v0:0, %v1:0, %v2:0, %v3:0
1166+ //
1167+ // One such suitable allocation would be:
1168+ //
1169+ // { z0, z8 } = ld1 p0/z, [...]
1170+ // { z1, z9 } = ld1 p0/z, [...]
1171+ // { z2, z10 } = ld1 p0/z, [...]
1172+ // { z3, z11 } = ld1 p0/z, [...]
1173+ // { z0, z1, z2, z3 } =
1174+ // FORM_TRANSPOSED_X4 {z0, z8}:0, {z1, z9}:0, {z2, z10}:0, {z3, z11}:0
1175+ //
1176+ // Below we distinguish two cases when trying to find a register:
1177+ // * None of the registers used by FORM_TRANSPOSED_X4 have been assigned
1178+ // yet. In this case the code must ensure that there are at least UseOps
1179+ // free consecutive registers. If IsMulZPR is true, then the first of
1180+ // these registers must also be a multiple of UseOps, e.g. { z0, z1, z2, z3 }
1181+ // is valid but { z1, z2, z3, z4 } is not.
1182+ // * One or more of the registers used by FORM_TRANSPOSED_X4 is already
1183+ // assigned a physical register, which means only checking that a
1184+ // consecutive range of free tuple registers exists which includes
1185+ // the assigned register.
1186+ // e.g. in the example above, if { z0, z8 } is already allocated for
1187+ // %v0, we just need to ensure that { z1, z9 }, { z2, z10 } and
1188+ // { z3, z11 } are also free. If so, we add { z2, z10 }.
1189+
11501190 if (AssignedRegOp == Use.operands_end ()) {
11511191 // There are no registers already assigned to any of the pseudo
11521192 // operands. Look for a valid starting register for the group.
11531193 for (unsigned I = 0 ; I < StridedOrder.size (); ++I) {
11541194 MCPhysReg Reg = StridedOrder[I];
11551195 SmallVector<MCPhysReg> Regs;
1156- unsigned FirstStridedReg = Reg - OpIdx + 1 ;
11571196
11581197 // If the FORM_TRANSPOSE nodes use the ZPRMul classes, the starting
11591198 // register of the first load should be a multiple of 2 or 4.
1160- unsigned FirstSubReg = getSubReg (FirstStridedReg, AArch64::zsub0);
1161- if (IsMulZPR && (FirstSubReg - AArch64::Z0) % LdOps != 0 )
1199+ unsigned SubRegIdx = Use.getOperand (OpIdx).getSubReg ();
1200+ if (IsMulZPR && (getSubReg (Reg, SubRegIdx) - AArch64::Z0) % UseOps !=
1201+ ((unsigned )OpIdx - 1 ))
11621202 continue ;
11631203
1164- for (unsigned Op = 0 ; Op < LdOps; ++Op) {
1165- if (!is_contained (StridedOrder, FirstStridedReg + Op) ||
1166- getSubReg (FirstStridedReg + Op, AArch64::zsub0) !=
1167- FirstSubReg + Op)
1168- break ;
1169- Regs.push_back (FirstStridedReg + Op);
1170- }
1171-
1172- if (Regs.size () == LdOps && all_of (Regs, [&](MCPhysReg R) {
1173- return !Matrix->isPhysRegUsed (R);
1174- }))
1175- Hints.push_back (FirstStridedReg + OpIdx - 1 );
1204+ // In the example above, if VirtReg is the third operand of the
1205+ // tuple (%v2) and Reg == Z2_Z10, then we need to make sure that
1206+ // Z0_Z8, Z1_Z9 and Z3_Z11 are also available.
1207+ auto IsFreeConsecutiveReg = [&](unsigned UseOp) {
1208+ unsigned R = Reg - (OpIdx - 1 ) + UseOp;
1209+ return StridedRC->contains (R) &&
1210+ (UseOp == 0 ||
1211+ ((getSubReg (R, AArch64::zsub0) - AArch64::Z0) ==
1212+ (getSubReg (R - 1 , AArch64::zsub0) - AArch64::Z0) + 1 )) &&
1213+ !Matrix->isPhysRegUsed (R);
1214+ };
1215+ if (all_of (iota_range<unsigned >(0U , UseOps, /* Inclusive=*/ false ),
1216+ IsFreeConsecutiveReg))
1217+ Hints.push_back (Reg);
11761218 }
11771219 } else {
11781220 // At least one operand already has a physical register assigned.
0 commit comments