
Commit 7fe60a7

[AArch64][SVE] Avoid movprfx by reusing register for _UNDEF pseudos. (#166926)
For predicated SVE instructions where we know that the inactive lanes are undef, it is better to pick a destination register that is not unique. This avoids introducing a movprfx to copy a unique register to the destination operand, which would be needed to comply with the tied operand constraints. For example: ``` %src1 = COPY $z1 %src2 = COPY $z2 %dst = SDIV_ZPZZ_S_UNDEF %p, %src1, %src2 ``` Here it is beneficial to pick $z1 or $z2 as the destination register, because if it would have chosen a unique register (e.g. $z0) then the pseudo expand pass would need to insert a MOVPRFX to expand the operation into: ``` $z0 = SDIV_ZPZZ_S_UNDEF $p0, $z1, $z2 -> $z0 = MOVPRFX $z1 $z0 = SDIV_ZPmZ_S $p0, $z0, $z2 ``` By picking $z1 directly, we'd get: ``` $z1 = SDIV_ZPmZ_S, $p0 $z1, $z2 ```
1 parent (e1f8690), commit 7fe60a7

30 files changed (+417, -527 lines)
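To make the tied-operand constraint concrete, here is a minimal standalone sketch of the rule the hint is working around. This is illustrative only: `needsMovPrfx` is a made-up helper, not the actual AArch64ExpandPseudo logic; register numbers stand in for $z0, $z1, etc.

```
#include <cstdio>

// A destructive SVE instruction ties its destination to its first source
// register, so expanding `Dst = OP Pg, Src1, Src2` into the real
// `Dst = OP Pg, Dst, Src2` needs a `movprfx Dst, Src1` whenever the
// register allocator picked Dst != Src1.
static bool needsMovPrfx(unsigned Dst, unsigned Src1) { return Dst != Src1; }

int main() {
  // $z0 = SDIV_ZPZZ_S_UNDEF $p0, $z1, $z2  ->  movprfx required.
  std::printf("dst=z0, src1=z1: movprfx=%d\n", needsMovPrfx(0, 1));
  // $z1 = SDIV_ZPZZ_S_UNDEF $p0, $z1, $z2  ->  expands directly.
  std::printf("dst=z1, src1=z1: movprfx=%d\n", needsMovPrfx(1, 1));
  return 0;
}
```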

llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp

Lines changed: 73 additions & 13 deletions
@@ -1123,24 +1123,85 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
 }
 
-// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation
-// where a consecutive multi-vector tuple is constructed from the same indices
-// of multiple strided loads. This may still result in unnecessary copies
-// between the loads and the tuple. Here we try to return a hint to assign the
-// contiguous ZPRMulReg starting at the same register as the first operand of
-// the pseudo, which should be a subregister of the first strided load.
+// We add regalloc hints for different cases:
+// * Choosing a better destination operand for predicated SVE instructions
+//   where the inactive lanes are undef, by choosing a register that is not
+//   unique to the other operands of the instruction.
 //
-// For example, if the first strided load has been assigned $z16_z20_z24_z28
-// and the operands of the pseudo are each accessing subregister zsub2, we
-// should look through Order to find a contiguous register which
-// begins with $z24 (i.e. $z24_z25_z26_z27).
+// * Improve register allocation for SME multi-vector instructions where we can
+//   benefit from the strided- and contiguous register multi-vector tuples.
 //
+// Here FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register
+// allocation where a consecutive multi-vector tuple is constructed from the
+// same indices of multiple strided loads. This may still result in
+// unnecessary copies between the loads and the tuple. Here we try to return a
+// hint to assign the contiguous ZPRMulReg starting at the same register as
+// the first operand of the pseudo, which should be a subregister of the first
+// strided load.
+//
+// For example, if the first strided load has been assigned $z16_z20_z24_z28
+// and the operands of the pseudo are each accessing subregister zsub2, we
+// should look through Order to find a contiguous register which
+// begins with $z24 (i.e. $z24_z25_z26_z27).
 bool AArch64RegisterInfo::getRegAllocationHints(
     Register VirtReg, ArrayRef<MCPhysReg> Order,
     SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
     const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
-
   auto &ST = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // For predicated SVE instructions where the inactive lanes are undef,
+  // pick a destination register that is not unique to avoid introducing
+  // a movprfx.
+  const TargetRegisterClass *RegRC = MRI.getRegClass(VirtReg);
+  if (AArch64::ZPRRegClass.hasSubClassEq(RegRC)) {
+    for (const MachineOperand &DefOp : MRI.def_operands(VirtReg)) {
+      const MachineInstr &Def = *DefOp.getParent();
+      if (DefOp.isImplicit() ||
+          (TII->get(Def.getOpcode()).TSFlags & AArch64::FalseLanesMask) !=
+              AArch64::FalseLanesUndef)
+        continue;
+
+      unsigned InstFlags =
+          TII->get(AArch64::getSVEPseudoMap(Def.getOpcode())).TSFlags;
+
+      for (MCPhysReg R : Order) {
+        auto AddHintIfSuitable = [&](MCPhysReg R, const MachineOperand &MO) {
+          // R is a suitable register hint if there exists an operand for the
+          // instruction that is not yet allocated a register or if R matches
+          // one of the other source operands.
+          if (!VRM->hasPhys(MO.getReg()) || VRM->getPhys(MO.getReg()) == R)
+            Hints.push_back(R);
+        };
+
+        switch (InstFlags & AArch64::DestructiveInstTypeMask) {
+        default:
+          break;
+        case AArch64::DestructiveTernaryCommWithRev:
+          AddHintIfSuitable(R, Def.getOperand(2));
+          AddHintIfSuitable(R, Def.getOperand(3));
+          AddHintIfSuitable(R, Def.getOperand(4));
+          break;
+        case AArch64::DestructiveBinaryComm:
+        case AArch64::DestructiveBinaryCommWithRev:
+          AddHintIfSuitable(R, Def.getOperand(2));
+          AddHintIfSuitable(R, Def.getOperand(3));
+          break;
+        case AArch64::DestructiveBinary:
+        case AArch64::DestructiveBinaryImm:
+          AddHintIfSuitable(R, Def.getOperand(2));
+          break;
+        }
+      }
+    }
+
+    if (Hints.size())
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+  }
+
   if (!ST.hasSME() || !ST.isStreaming())
     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                      VRM);
@@ -1153,8 +1214,7 @@ bool AArch64RegisterInfo::getRegAllocationHints(
   // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy
   // instructions over reducing the number of clobbered callee-save registers,
   // so we add the strided registers as a hint.
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  unsigned RegID = MRI.getRegClass(VirtReg)->getID();
+  unsigned RegID = RegRC->getID();
   if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
       RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
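As a rough standalone illustration of the hint test in `AddHintIfSuitable` above, here is a sketch using plain standard-library types in place of `VirtRegMap` and `MachineOperand` (the helper names `suitableHint` and `collectHints` are made up for this example): a physical register from the allocation order is hinted when a source operand either has no physical assignment yet or already lives in that same register.

```
#include <optional>
#include <vector>

using PhysReg = unsigned;

// Sketch of the predicate in AddHintIfSuitable: R is worth hinting when the
// source operand is still unassigned, or when it is already assigned R (so
// the destructive expansion can reuse it without a movprfx).
static bool suitableHint(PhysReg R, const std::optional<PhysReg> &SrcPhys) {
  // Mirrors: !VRM->hasPhys(MO.getReg()) || VRM->getPhys(MO.getReg()) == R
  return !SrcPhys.has_value() || *SrcPhys == R;
}

// Walk the allocation order and test each source operand, the way the loop
// over `Order` above does for operands 2..4, depending on the destructive
// instruction type.
static std::vector<PhysReg>
collectHints(const std::vector<PhysReg> &Order,
             const std::vector<std::optional<PhysReg>> &SrcAssignments) {
  std::vector<PhysReg> Hints;
  for (PhysReg R : Order)
    for (const auto &Src : SrcAssignments)
      if (suitableHint(R, Src))
        Hints.push_back(R);
  return Hints;
}

int main() {
  // Sources already assigned z1 and z2; the order prefers z0, z1, z2.
  auto Hints = collectHints({0, 1, 2}, {PhysReg{1}, PhysReg{2}});
  return Hints.size() == 2 ? 0 : 1; // z1 and z2 are suitable, z0 is not.
}
```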

llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll

Lines changed: 3 additions & 4 deletions
@@ -52,12 +52,11 @@ define <2 x i64> @test_mul_sub_2x64_2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c,
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z3
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z3
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT:    sub v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT:    sub v0.2d, v2.2d, v0.2d
 ; CHECK-NEXT:    ret
   %div = sdiv <2 x i64> %a, %b
   %mul = mul <2 x i64> %c, %d

llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll

Lines changed: 9 additions & 13 deletions
@@ -14,13 +14,12 @@ define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x d
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fmul z7.d, z0.d, z1.d
 ; CHECK-NEXT:    fmul z1.d, z6.d, z1.d
-; CHECK-NEXT:    movprfx z3, z7
-; CHECK-NEXT:    fmla z3.d, p0/m, z6.d, z2.d
+; CHECK-NEXT:    fmad z6.d, p0/m, z2.d, z7.d
 ; CHECK-NEXT:    fnmsb z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    uzp2 z1.d, z4.d, z5.d
 ; CHECK-NEXT:    uzp1 z2.d, z4.d, z5.d
 ; CHECK-NEXT:    fadd z2.d, z2.d, z0.d
-; CHECK-NEXT:    fadd z1.d, z3.d, z1.d
+; CHECK-NEXT:    fadd z1.d, z6.d, z1.d
 ; CHECK-NEXT:    zip1 z0.d, z2.d, z1.d
 ; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    ret
@@ -225,17 +224,14 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
 ; CHECK-NEXT:    fmul z1.d, z25.d, z1.d
 ; CHECK-NEXT:    fmul z3.d, z4.d, z24.d
 ; CHECK-NEXT:    fmul z24.d, z5.d, z24.d
-; CHECK-NEXT:    movprfx z7, z26
-; CHECK-NEXT:    fmla z7.d, p0/m, z25.d, z2.d
+; CHECK-NEXT:    fmad z25.d, p0/m, z2.d, z26.d
 ; CHECK-NEXT:    fnmsb z0.d, p0/m, z2.d, z1.d
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    fmla z1.d, p0/m, z6.d, z5.d
-; CHECK-NEXT:    movprfx z2, z24
-; CHECK-NEXT:    fnmls z2.d, p0/m, z4.d, z6.d
-; CHECK-NEXT:    fadd z2.d, z0.d, z2.d
-; CHECK-NEXT:    fadd z1.d, z7.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z2.d, z1.d
-; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d
+; CHECK-NEXT:    fmla z3.d, p0/m, z6.d, z5.d
+; CHECK-NEXT:    fnmsb z4.d, p0/m, z6.d, z24.d
+; CHECK-NEXT:    fadd z1.d, z0.d, z4.d
+; CHECK-NEXT:    fadd z2.d, z25.d, z3.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 z1.d, z1.d, z2.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)

llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll

Lines changed: 3 additions & 5 deletions
@@ -200,12 +200,10 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
 ; CHECK-NEXT:    fmul z3.d, z2.d, z25.d
 ; CHECK-NEXT:    fmul z25.d, z24.d, z25.d
 ; CHECK-NEXT:    fmla z3.d, p0/m, z24.d, z0.d
-; CHECK-NEXT:    movprfx z24, z25
-; CHECK-NEXT:    fmla z24.d, p0/m, z26.d, z1.d
-; CHECK-NEXT:    movprfx z6, z24
-; CHECK-NEXT:    fmla z6.d, p0/m, z5.d, z4.d
+; CHECK-NEXT:    fmla z25.d, p0/m, z26.d, z1.d
+; CHECK-NEXT:    fmla z25.d, p0/m, z5.d, z4.d
 ; CHECK-NEXT:    fmla z3.d, p0/m, z26.d, z4.d
-; CHECK-NEXT:    fnmsb z2.d, p0/m, z0.d, z6.d
+; CHECK-NEXT:    fnmsb z2.d, p0/m, z0.d, z25.d
 ; CHECK-NEXT:    fmsb z1.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT:    zip1 z0.d, z2.d, z1.d
 ; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d

llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll

Lines changed: 4 additions & 5 deletions
@@ -17,11 +17,10 @@ define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x
 ; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    fsubr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
+; CHECK-NEXT:    fadd z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT:    zip2 z1.d, z0.d, z2.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z2.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)

llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll

Lines changed: 3 additions & 4 deletions
@@ -18,11 +18,10 @@ define <vscale x 4 x i16> @complex_mul_v4i16(<vscale x 4 x i16> %a, <vscale x 4
 ; CHECK-NEXT:    uzp2 z1.d, z1.d, z3.d
 ; CHECK-NEXT:    mul z5.d, z2.d, z0.d
 ; CHECK-NEXT:    mul z2.d, z2.d, z4.d
-; CHECK-NEXT:    movprfx z3, z5
-; CHECK-NEXT:    mla z3.d, p0/m, z1.d, z4.d
+; CHECK-NEXT:    mad z4.d, p0/m, z1.d, z5.d
 ; CHECK-NEXT:    msb z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    zip2 z1.d, z0.d, z3.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT:    zip2 z1.d, z0.d, z4.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z4.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:

llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll

Lines changed: 2 additions & 3 deletions
@@ -1148,11 +1148,10 @@ define <vscale x 4 x i64> @fshl_rot_illegal_i64(<vscale x 4 x i64> %a, <vscale x
 ; CHECK-NEXT:    and z3.d, z3.d, #0x3f
 ; CHECK-NEXT:    lslr z4.d, p0/m, z4.d, z0.d
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z2.d
-; CHECK-NEXT:    movprfx z2, z1
-; CHECK-NEXT:    lsl z2.d, p0/m, z2.d, z5.d
+; CHECK-NEXT:    lslr z5.d, p0/m, z5.d, z1.d
 ; CHECK-NEXT:    lsr z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    orr z0.d, z4.d, z0.d
-; CHECK-NEXT:    orr z1.d, z2.d, z1.d
+; CHECK-NEXT:    orr z1.d, z5.d, z1.d
 ; CHECK-NEXT:    ret
   %fshl = call <vscale x 4 x i64> @llvm.fshl.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
   ret <vscale x 4 x i64> %fshl

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll

Lines changed: 24 additions & 36 deletions
@@ -55,10 +55,9 @@ define void @fadd_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    fadd z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fadd_v32f16:
@@ -154,10 +153,9 @@ define void @fadd_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    fadd z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fadd_v16f32:
@@ -253,10 +251,9 @@ define void @fadd_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    fadd z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fadd_v8f64:
@@ -660,10 +657,9 @@ define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x2]
 ; VBITS_GE_256-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
-; VBITS_GE_256-NEXT:    movprfx z1, z5
-; VBITS_GE_256-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
+; VBITS_GE_256-NEXT:    fmad z3.h, p0/m, z4.h, z5.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v32f16:
@@ -771,10 +767,9 @@ define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x2]
 ; VBITS_GE_256-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
-; VBITS_GE_256-NEXT:    movprfx z1, z5
-; VBITS_GE_256-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
+; VBITS_GE_256-NEXT:    fmad z3.s, p0/m, z4.s, z5.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v16f32:
@@ -881,10 +876,9 @@ define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x2]
 ; VBITS_GE_256-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
-; VBITS_GE_256-NEXT:    movprfx z1, z5
-; VBITS_GE_256-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
+; VBITS_GE_256-NEXT:    fmad z3.d, p0/m, z4.d, z5.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v8f64:
@@ -990,10 +984,9 @@ define void @fmul_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    fmul z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmul_v32f16:
@@ -1089,10 +1082,9 @@ define void @fmul_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    fmul z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmul_v16f32:
@@ -1188,10 +1180,9 @@ define void @fmul_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fmul z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    fmul z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmul_v8f64:
@@ -1827,10 +1818,9 @@ define void @fsub_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fsub z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    fsub z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fsub_v32f16:
@@ -1926,10 +1916,9 @@ define void @fsub_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fsub z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    fsub z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fsub_v16f32:
@@ -2025,10 +2014,9 @@ define void @fsub_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fsub z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    fsub z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fsub_v8f64:
