
Commit 7fe60a7

[AArch64][SVE] Avoid movprfx by reusing register for _UNDEF pseudos. (#166926)
For predicated SVE instructions where we know that the inactive lanes are undef, it is better to pick a destination register that is not unique. This avoids introducing a movprfx to copy a unique register to the destination operand, which would be needed to comply with the tied operand constraints. For example: ``` %src1 = COPY $z1 %src2 = COPY $z2 %dst = SDIV_ZPZZ_S_UNDEF %p, %src1, %src2 ``` Here it is beneficial to pick $z1 or $z2 as the destination register, because if it would have chosen a unique register (e.g. $z0) then the pseudo expand pass would need to insert a MOVPRFX to expand the operation into: ``` $z0 = SDIV_ZPZZ_S_UNDEF $p0, $z1, $z2 -> $z0 = MOVPRFX $z1 $z0 = SDIV_ZPmZ_S $p0, $z0, $z2 ``` By picking $z1 directly, we'd get: ``` $z1 = SDIV_ZPmZ_S, $p0 $z1, $z2 ```
1 parent (e1f8690), commit 7fe60a7

30 files changed (+417, -527 lines)
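To make the tied-operand constraint concrete, here is a minimal standalone sketch of the rule the hint is working around. This is illustrative only: `needsMovPrfx` is a made-up helper, not the actual AArch64ExpandPseudo logic; register numbers stand in for $z0, $z1, etc.

```
#include <cstdio>

// A destructive SVE instruction ties its destination to its first source
// register, so expanding `Dst = OP Pg, Src1, Src2` into the real
// `Dst = OP Pg, Dst, Src2` needs a `movprfx Dst, Src1` whenever the
// register allocator picked Dst != Src1.
static bool needsMovPrfx(unsigned Dst, unsigned Src1) { return Dst != Src1; }

int main() {
  // $z0 = SDIV_ZPZZ_S_UNDEF $p0, $z1, $z2  ->  movprfx required.
  std::printf("dst=z0, src1=z1: movprfx=%d\n", needsMovPrfx(0, 1));
  // $z1 = SDIV_ZPZZ_S_UNDEF $p0, $z1, $z2  ->  expands directly.
  std::printf("dst=z1, src1=z1: movprfx=%d\n", needsMovPrfx(1, 1));
  return 0;
}
```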

llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp

Lines changed: 73 additions & 13 deletions
@@ -1123,24 +1123,85 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
 }
 
-// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation
-// where a consecutive multi-vector tuple is constructed from the same indices
-// of multiple strided loads. This may still result in unnecessary copies
-// between the loads and the tuple. Here we try to return a hint to assign the
-// contiguous ZPRMulReg starting at the same register as the first operand of
-// the pseudo, which should be a subregister of the first strided load.
+// We add regalloc hints for different cases:
+// * Choosing a better destination operand for predicated SVE instructions
+//   where the inactive lanes are undef, by choosing a register that is not
+//   unique to the other operands of the instruction.
 //
-// For example, if the first strided load has been assigned $z16_z20_z24_z28
-// and the operands of the pseudo are each accessing subregister zsub2, we
-// should look through Order to find a contiguous register which
-// begins with $z24 (i.e. $z24_z25_z26_z27).
+// * Improve register allocation for SME multi-vector instructions where we can
+//   benefit from the strided- and contiguous register multi-vector tuples.
 //
+// Here FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register
+// allocation where a consecutive multi-vector tuple is constructed from the
+// same indices of multiple strided loads. This may still result in
+// unnecessary copies between the loads and the tuple. Here we try to return a
+// hint to assign the contiguous ZPRMulReg starting at the same register as
+// the first operand of the pseudo, which should be a subregister of the first
+// strided load.
+//
+// For example, if the first strided load has been assigned $z16_z20_z24_z28
+// and the operands of the pseudo are each accessing subregister zsub2, we
+// should look through Order to find a contiguous register which
+// begins with $z24 (i.e. $z24_z25_z26_z27).
 bool AArch64RegisterInfo::getRegAllocationHints(
     Register VirtReg, ArrayRef<MCPhysReg> Order,
     SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
     const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
-
   auto &ST = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // For predicated SVE instructions where the inactive lanes are undef,
+  // pick a destination register that is not unique to avoid introducing
+  // a movprfx.
+  const TargetRegisterClass *RegRC = MRI.getRegClass(VirtReg);
+  if (AArch64::ZPRRegClass.hasSubClassEq(RegRC)) {
+    for (const MachineOperand &DefOp : MRI.def_operands(VirtReg)) {
+      const MachineInstr &Def = *DefOp.getParent();
+      if (DefOp.isImplicit() ||
+          (TII->get(Def.getOpcode()).TSFlags & AArch64::FalseLanesMask) !=
+              AArch64::FalseLanesUndef)
+        continue;
+
+      unsigned InstFlags =
+          TII->get(AArch64::getSVEPseudoMap(Def.getOpcode())).TSFlags;
+
+      for (MCPhysReg R : Order) {
+        auto AddHintIfSuitable = [&](MCPhysReg R, const MachineOperand &MO) {
+          // R is a suitable register hint if there exists an operand for the
+          // instruction that is not yet allocated a register or if R matches
+          // one of the other source operands.
+          if (!VRM->hasPhys(MO.getReg()) || VRM->getPhys(MO.getReg()) == R)
+            Hints.push_back(R);
+        };
+
+        switch (InstFlags & AArch64::DestructiveInstTypeMask) {
+        default:
+          break;
+        case AArch64::DestructiveTernaryCommWithRev:
+          AddHintIfSuitable(R, Def.getOperand(2));
+          AddHintIfSuitable(R, Def.getOperand(3));
+          AddHintIfSuitable(R, Def.getOperand(4));
+          break;
+        case AArch64::DestructiveBinaryComm:
+        case AArch64::DestructiveBinaryCommWithRev:
+          AddHintIfSuitable(R, Def.getOperand(2));
+          AddHintIfSuitable(R, Def.getOperand(3));
+          break;
+        case AArch64::DestructiveBinary:
+        case AArch64::DestructiveBinaryImm:
+          AddHintIfSuitable(R, Def.getOperand(2));
+          break;
+        }
+      }
+    }
+
+    if (Hints.size())
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+  }
+
   if (!ST.hasSME() || !ST.isStreaming())
     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                      VRM);
@@ -1153,8 +1214,7 @@ bool AArch64RegisterInfo::getRegAllocationHints(
   // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy
   // instructions over reducing the number of clobbered callee-save registers,
   // so we add the strided registers as a hint.
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  unsigned RegID = MRI.getRegClass(VirtReg)->getID();
+  unsigned RegID = RegRC->getID();
   if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
       RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
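As a rough standalone illustration of the hint test in `AddHintIfSuitable` above, here is a sketch using plain standard-library types in place of `VirtRegMap` and `MachineOperand` (the helper names `suitableHint` and `collectHints` are made up for this example): a physical register from the allocation order is hinted when a source operand either has no physical assignment yet or already lives in that same register.

```
#include <optional>
#include <vector>

using PhysReg = unsigned;

// Sketch of the predicate in AddHintIfSuitable: R is worth hinting when the
// source operand is still unassigned, or when it is already assigned R (so
// the destructive expansion can reuse it without a movprfx).
static bool suitableHint(PhysReg R, const std::optional<PhysReg> &SrcPhys) {
  // Mirrors: !VRM->hasPhys(MO.getReg()) || VRM->getPhys(MO.getReg()) == R
  return !SrcPhys.has_value() || *SrcPhys == R;
}

// Walk the allocation order and test each source operand, the way the loop
// over `Order` above does for operands 2..4, depending on the destructive
// instruction type.
static std::vector<PhysReg>
collectHints(const std::vector<PhysReg> &Order,
             const std::vector<std::optional<PhysReg>> &SrcAssignments) {
  std::vector<PhysReg> Hints;
  for (PhysReg R : Order)
    for (const auto &Src : SrcAssignments)
      if (suitableHint(R, Src))
        Hints.push_back(R);
  return Hints;
}

int main() {
  // Sources already assigned z1 and z2; the order prefers z0, z1, z2.
  auto Hints = collectHints({0, 1, 2}, {PhysReg{1}, PhysReg{2}});
  return Hints.size() == 2 ? 0 : 1; // z1 and z2 are suitable, z0 is not.
}
```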

llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll

Lines changed: 3 additions & 4 deletions
@@ -52,12 +52,11 @@ define <2 x i64> @test_mul_sub_2x64_2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c,
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z3
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z3
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT:    sub v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT:    sub v0.2d, v2.2d, v0.2d
 ; CHECK-NEXT:    ret
   %div = sdiv <2 x i64> %a, %b
   %mul = mul <2 x i64> %c, %d

llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll

Lines changed: 9 additions & 13 deletions
@@ -14,13 +14,12 @@ define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x d
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fmul z7.d, z0.d, z1.d
 ; CHECK-NEXT:    fmul z1.d, z6.d, z1.d
-; CHECK-NEXT:    movprfx z3, z7
-; CHECK-NEXT:    fmla z3.d, p0/m, z6.d, z2.d
+; CHECK-NEXT:    fmad z6.d, p0/m, z2.d, z7.d
 ; CHECK-NEXT:    fnmsb z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    uzp2 z1.d, z4.d, z5.d
 ; CHECK-NEXT:    uzp1 z2.d, z4.d, z5.d
 ; CHECK-NEXT:    fadd z2.d, z2.d, z0.d
-; CHECK-NEXT:    fadd z1.d, z3.d, z1.d
+; CHECK-NEXT:    fadd z1.d, z6.d, z1.d
 ; CHECK-NEXT:    zip1 z0.d, z2.d, z1.d
 ; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    ret
@@ -225,17 +224,14 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
 ; CHECK-NEXT:    fmul z1.d, z25.d, z1.d
 ; CHECK-NEXT:    fmul z3.d, z4.d, z24.d
 ; CHECK-NEXT:    fmul z24.d, z5.d, z24.d
-; CHECK-NEXT:    movprfx z7, z26
-; CHECK-NEXT:    fmla z7.d, p0/m, z25.d, z2.d
+; CHECK-NEXT:    fmad z25.d, p0/m, z2.d, z26.d
 ; CHECK-NEXT:    fnmsb z0.d, p0/m, z2.d, z1.d
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    fmla z1.d, p0/m, z6.d, z5.d
-; CHECK-NEXT:    movprfx z2, z24
-; CHECK-NEXT:    fnmls z2.d, p0/m, z4.d, z6.d
-; CHECK-NEXT:    fadd z2.d, z0.d, z2.d
-; CHECK-NEXT:    fadd z1.d, z7.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z2.d, z1.d
-; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d
+; CHECK-NEXT:    fmla z3.d, p0/m, z6.d, z5.d
+; CHECK-NEXT:    fnmsb z4.d, p0/m, z6.d, z24.d
+; CHECK-NEXT:    fadd z1.d, z0.d, z4.d
+; CHECK-NEXT:    fadd z2.d, z25.d, z3.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 z1.d, z1.d, z2.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)

llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll

Lines changed: 3 additions & 5 deletions
@@ -200,12 +200,10 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
 ; CHECK-NEXT:    fmul z3.d, z2.d, z25.d
 ; CHECK-NEXT:    fmul z25.d, z24.d, z25.d
 ; CHECK-NEXT:    fmla z3.d, p0/m, z24.d, z0.d
-; CHECK-NEXT:    movprfx z24, z25
-; CHECK-NEXT:    fmla z24.d, p0/m, z26.d, z1.d
-; CHECK-NEXT:    movprfx z6, z24
-; CHECK-NEXT:    fmla z6.d, p0/m, z5.d, z4.d
+; CHECK-NEXT:    fmla z25.d, p0/m, z26.d, z1.d
+; CHECK-NEXT:    fmla z25.d, p0/m, z5.d, z4.d
 ; CHECK-NEXT:    fmla z3.d, p0/m, z26.d, z4.d
-; CHECK-NEXT:    fnmsb z2.d, p0/m, z0.d, z6.d
+; CHECK-NEXT:    fnmsb z2.d, p0/m, z0.d, z25.d
 ; CHECK-NEXT:    fmsb z1.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT:    zip1 z0.d, z2.d, z1.d
 ; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d

llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll

Lines changed: 4 additions & 5 deletions
@@ -17,11 +17,10 @@ define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x
 ; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    fsubr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
+; CHECK-NEXT:    fadd z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT:    zip2 z1.d, z0.d, z2.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z2.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)

llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll

Lines changed: 3 additions & 4 deletions
@@ -18,11 +18,10 @@ define <vscale x 4 x i16> @complex_mul_v4i16(<vscale x 4 x i16> %a, <vscale x 4
 ; CHECK-NEXT:    uzp2 z1.d, z1.d, z3.d
 ; CHECK-NEXT:    mul z5.d, z2.d, z0.d
 ; CHECK-NEXT:    mul z2.d, z2.d, z4.d
-; CHECK-NEXT:    movprfx z3, z5
-; CHECK-NEXT:    mla z3.d, p0/m, z1.d, z4.d
+; CHECK-NEXT:    mad z4.d, p0/m, z1.d, z5.d
 ; CHECK-NEXT:    msb z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    zip2 z1.d, z0.d, z3.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT:    zip2 z1.d, z0.d, z4.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z4.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:

llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll

Lines changed: 2 additions & 3 deletions
@@ -1148,11 +1148,10 @@ define <vscale x 4 x i64> @fshl_rot_illegal_i64(<vscale x 4 x i64> %a, <vscale x
 ; CHECK-NEXT:    and z3.d, z3.d, #0x3f
 ; CHECK-NEXT:    lslr z4.d, p0/m, z4.d, z0.d
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z2.d
-; CHECK-NEXT:    movprfx z2, z1
-; CHECK-NEXT:    lsl z2.d, p0/m, z2.d, z5.d
+; CHECK-NEXT:    lslr z5.d, p0/m, z5.d, z1.d
 ; CHECK-NEXT:    lsr z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    orr z0.d, z4.d, z0.d
-; CHECK-NEXT:    orr z1.d, z2.d, z1.d
+; CHECK-NEXT:    orr z1.d, z5.d, z1.d
 ; CHECK-NEXT:    ret
   %fshl = call <vscale x 4 x i64> @llvm.fshl.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
   ret <vscale x 4 x i64> %fshl

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll

Lines changed: 24 additions & 36 deletions
@@ -55,10 +55,9 @@ define void @fadd_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    fadd z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fadd_v32f16:
@@ -154,10 +153,9 @@ define void @fadd_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    fadd z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fadd_v16f32:
@@ -253,10 +251,9 @@ define void @fadd_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    fadd z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fadd_v8f64:
@@ -660,10 +657,9 @@ define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x2]
 ; VBITS_GE_256-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
-; VBITS_GE_256-NEXT:    movprfx z1, z5
-; VBITS_GE_256-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
+; VBITS_GE_256-NEXT:    fmad z3.h, p0/m, z4.h, z5.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v32f16:
@@ -771,10 +767,9 @@ define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x2]
 ; VBITS_GE_256-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
-; VBITS_GE_256-NEXT:    movprfx z1, z5
-; VBITS_GE_256-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
+; VBITS_GE_256-NEXT:    fmad z3.s, p0/m, z4.s, z5.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v16f32:
@@ -881,10 +876,9 @@ define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x2]
 ; VBITS_GE_256-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
-; VBITS_GE_256-NEXT:    movprfx z1, z5
-; VBITS_GE_256-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
+; VBITS_GE_256-NEXT:    fmad z3.d, p0/m, z4.d, z5.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fma_v8f64:
@@ -990,10 +984,9 @@ define void @fmul_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    fmul z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmul_v32f16:
@@ -1089,10 +1082,9 @@ define void @fmul_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    fmul z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmul_v16f32:
@@ -1188,10 +1180,9 @@ define void @fmul_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fmul z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    fmul z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fmul_v8f64:
@@ -1827,10 +1818,9 @@ define void @fsub_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fsub z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    fsub z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fsub_v32f16:
@@ -1926,10 +1916,9 @@ define void @fsub_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fsub z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    fsub z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fsub_v16f32:
@@ -2025,10 +2014,9 @@ define void @fsub_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT:    movprfx z1, z2
-; VBITS_GE_256-NEXT:    fsub z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    fsub z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fsub_v8f64:
