From 1cf49b9986cfdac38067601eb997f34956e94699 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj@nvidia.com>
Date: Mon, 11 Aug 2025 08:21:26 -0700
Subject: [PATCH 1/2] [LV] Pre-commit test for vectorisation of SAXPY unrolled
 by 5 (NFC).

This test contains a vectorisation example of a loop based on SAXPY
manually unrolled by five, as discussed in #148808.
---
 .../LoopVectorize/AArch64/saxpy-5.ll          | 248 ++++++++++++++++++
 1 file changed, 248 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/saxpy-5.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/saxpy-5.ll b/llvm/test/Transforms/LoopVectorize/AArch64/saxpy-5.ll
new file mode 100644
index 0000000000000..b74acfb9f05ea
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/saxpy-5.ll
@@ -0,0 +1,248 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=loop-vectorize | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -S -passes=loop-vectorize -mattr=+sve | FileCheck %s --check-prefix=CHECK-SVE
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+; This test contains an example of a loop based on SAXPY manually unrolled by
+; five:
+;
+;   void saxpy(long n, float a, float *x, float *y) {
+;     for (int i = 0; i < n; i += 5) {
+;       y[i] += a * x[i];
+;       y[i + 1] += a * x[i + 1];
+;       y[i + 2] += a * x[i + 2];
+;       y[i + 3] += a * x[i + 3];
+;       y[i + 4] += a * x[i + 4];
+;     }
+;   }
+;
+; Note: Although the loop is not vectorised with scalable vectors, we need +sve
+; for vectorisation due to an interaction with `prefersVectorizedAddressing'.
+
+define void @saxpy(i64 %n, float %a, ptr readonly %x, ptr noalias %y) {
+; CHECK-LABEL: define void @saxpy(
+; CHECK-SAME: i64 [[N:%.*]], float [[A:%.*]], ptr readonly [[X:%.*]], ptr noalias [[Y:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[XGEP1:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I1]]
+; CHECK-NEXT:    [[X1:%.*]] = load float, ptr [[XGEP1]], align 4
+; CHECK-NEXT:    [[AX1:%.*]] = fmul fast float [[X1]], [[A]]
+; CHECK-NEXT:    [[YGEP1:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I1]]
+; CHECK-NEXT:    [[Y1:%.*]] = load float, ptr [[YGEP1]], align 4
+; CHECK-NEXT:    [[AXPY1:%.*]] = fadd fast float [[Y1]], [[AX1]]
+; CHECK-NEXT:    store float [[AXPY1]], ptr [[YGEP1]], align 4
+; CHECK-NEXT:    [[I2:%.*]] = add nuw nsw i64 [[I1]], 1
+; CHECK-NEXT:    [[XGEP2:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I2]]
+; CHECK-NEXT:    [[X2:%.*]] = load float, ptr [[XGEP2]], align 4
+; CHECK-NEXT:    [[AX2:%.*]] = fmul fast float [[X2]], [[A]]
+; CHECK-NEXT:    [[YGEP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I2]]
+; CHECK-NEXT:    [[Y2:%.*]] = load float, ptr [[YGEP2]], align 4
+; CHECK-NEXT:    [[AXPY2:%.*]] = fadd fast float [[Y2]], [[AX2]]
+; CHECK-NEXT:    store float [[AXPY2]], ptr [[YGEP2]], align 4
+; CHECK-NEXT:    [[I3:%.*]] = add nuw nsw i64 [[I1]], 2
+; CHECK-NEXT:    [[XGEP3:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I3]]
+; CHECK-NEXT:    [[X3:%.*]] = load float, ptr [[XGEP3]], align 4
+; CHECK-NEXT:    [[AX3:%.*]] = fmul fast float [[X3]], [[A]]
+; CHECK-NEXT:    [[YGEP3:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I3]]
+; CHECK-NEXT:    [[Y3:%.*]] = load float, ptr [[YGEP3]], align 4
+; CHECK-NEXT:    [[AXPY3:%.*]] = fadd fast float [[Y3]], [[AX3]]
+; CHECK-NEXT:    store float [[AXPY3]], ptr [[YGEP3]], align 4
+; CHECK-NEXT:    [[I4:%.*]] = add nuw nsw i64 [[I1]], 3
+; CHECK-NEXT:    [[XGEP4:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I4]]
+; CHECK-NEXT:    [[X4:%.*]] = load float, ptr [[XGEP4]], align 4
+; CHECK-NEXT:    [[AX4:%.*]] = fmul fast float [[X4]], [[A]]
+; CHECK-NEXT:    [[YGEP4:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I4]]
+; CHECK-NEXT:    [[Y4:%.*]] = load float, ptr [[YGEP4]], align 4
+; CHECK-NEXT:    [[AXPY4:%.*]] = fadd fast float [[Y4]], [[AX4]]
+; CHECK-NEXT:    store float [[AXPY4]], ptr [[YGEP4]], align 4
+; CHECK-NEXT:    [[I5:%.*]] = add nuw nsw i64 [[I1]], 4
+; CHECK-NEXT:    [[XGEP5:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I5]]
+; CHECK-NEXT:    [[X5:%.*]] = load float, ptr [[XGEP5]], align 4
+; CHECK-NEXT:    [[AX5:%.*]] = fmul fast float [[X5]], [[A]]
+; CHECK-NEXT:    [[YGEP5:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I5]]
+; CHECK-NEXT:    [[Y5:%.*]] = load float, ptr [[YGEP5]], align 4
+; CHECK-NEXT:    [[AXPY5:%.*]] = fadd fast float [[Y5]], [[AX5]]
+; CHECK-NEXT:    store float [[AXPY5]], ptr [[YGEP5]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I1]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[N]], [[I_NEXT]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT:.*]]
+; CHECK:       [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+; CHECK-SVE-LABEL: define void @saxpy(
+; CHECK-SVE-SAME: i64 [[N:%.*]], float [[A:%.*]], ptr readonly [[X:%.*]], ptr noalias [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*:]]
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-SVE-NEXT:    br i1 [[TMP0]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK-SVE:       [[LOOP_PREHEADER]]:
+; CHECK-SVE-NEXT:    [[TMP1:%.*]] = add i64 [[N]], -1
+; CHECK-SVE-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 5
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2
+; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-SVE:       [[VECTOR_PH]]:
+; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
+; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-SVE-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 5
+; CHECK-SVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[A]], i64 0
+; CHECK-SVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]]
+; CHECK-SVE-NEXT:    [[WIDE_VEC:%.*]] = load <10 x float>, ptr [[TMP5]], align 4
+; CHECK-SVE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 1, i32 6>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 2, i32 7>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 3, i32 8>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 4, i32 9>
+; CHECK-SVE-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-SVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]]
+; CHECK-SVE-NEXT:    [[WIDE_VEC5:%.*]] = load <10 x float>, ptr [[TMP7]], align 4
+; CHECK-SVE-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 1, i32 6>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 2, i32 7>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 3, i32 8>
+; CHECK-SVE-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 4, i32 9>
+; CHECK-SVE-NEXT:    [[TMP8:%.*]] = fadd fast <2 x float> [[STRIDED_VEC6]], [[TMP6]]
+; CHECK-SVE-NEXT:    [[TMP9:%.*]] = fmul fast <2 x float> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
+; CHECK-SVE-NEXT:    [[TMP10:%.*]] = fadd fast <2 x float> [[STRIDED_VEC7]], [[TMP9]]
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = fmul fast <2 x float> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]]
+; CHECK-SVE-NEXT:    [[TMP12:%.*]] = fadd fast <2 x float> [[STRIDED_VEC8]], [[TMP11]]
+; CHECK-SVE-NEXT:    [[TMP13:%.*]] = fmul fast <2 x float> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
+; CHECK-SVE-NEXT:    [[TMP14:%.*]] = fadd fast <2 x float> [[STRIDED_VEC9]], [[TMP13]]
+; CHECK-SVE-NEXT:    [[TMP15:%.*]] = fmul fast <2 x float> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
+; CHECK-SVE-NEXT:    [[TMP16:%.*]] = fadd fast <2 x float> [[STRIDED_VEC10]], [[TMP15]]
+; CHECK-SVE-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-SVE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-SVE-NEXT:    [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-SVE-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-SVE-NEXT:    [[TMP21:%.*]] = shufflevector <8 x float> [[TMP19]], <8 x float> [[TMP20]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-SVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <10 x float> [[TMP21]], <10 x float> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
+; CHECK-SVE-NEXT:    store <10 x float> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-SVE-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-NEXT:    br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-SVE:       [[SCALAR_PH]]:
+; CHECK-SVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; CHECK-SVE-NEXT:    br label %[[LOOP:.*]]
+; CHECK-SVE:       [[LOOP]]:
+; CHECK-SVE-NEXT:    [[I1:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-SVE-NEXT:    [[XGEP1:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I1]]
+; CHECK-SVE-NEXT:    [[X1:%.*]] = load float, ptr [[XGEP1]], align 4
+; CHECK-SVE-NEXT:    [[AX1:%.*]] = fmul fast float [[X1]], [[A]]
+; CHECK-SVE-NEXT:    [[YGEP1:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I1]]
+; CHECK-SVE-NEXT:    [[Y1:%.*]] = load float, ptr [[YGEP1]], align 4
+; CHECK-SVE-NEXT:    [[AXPY1:%.*]] = fadd fast float [[Y1]], [[AX1]]
+; CHECK-SVE-NEXT:    store float [[AXPY1]], ptr [[YGEP1]], align 4
+; CHECK-SVE-NEXT:    [[I2:%.*]] = add nuw nsw i64 [[I1]], 1
+; CHECK-SVE-NEXT:    [[XGEP2:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I2]]
+; CHECK-SVE-NEXT:    [[X2:%.*]] = load float, ptr [[XGEP2]], align 4
+; CHECK-SVE-NEXT:    [[AX2:%.*]] = fmul fast float [[X2]], [[A]]
+; CHECK-SVE-NEXT:    [[YGEP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I2]]
+; CHECK-SVE-NEXT:    [[Y2:%.*]] = load float, ptr [[YGEP2]], align 4
+; CHECK-SVE-NEXT:    [[AXPY2:%.*]] = fadd fast float [[Y2]], [[AX2]]
+; CHECK-SVE-NEXT:    store float [[AXPY2]], ptr [[YGEP2]], align 4
+; CHECK-SVE-NEXT:    [[I3:%.*]] = add nuw nsw i64 [[I1]], 2
+; CHECK-SVE-NEXT:    [[XGEP3:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I3]]
+; CHECK-SVE-NEXT:    [[X3:%.*]] = load float, ptr [[XGEP3]], align 4
+; CHECK-SVE-NEXT:    [[AX3:%.*]] = fmul fast float [[X3]], [[A]]
+; CHECK-SVE-NEXT:    [[YGEP3:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I3]]
+; CHECK-SVE-NEXT:    [[Y3:%.*]] = load float, ptr [[YGEP3]], align 4
+; CHECK-SVE-NEXT:    [[AXPY3:%.*]] = fadd fast float [[Y3]], [[AX3]]
+; CHECK-SVE-NEXT:    store float [[AXPY3]], ptr [[YGEP3]], align 4
+; CHECK-SVE-NEXT:    [[I4:%.*]] = add nuw nsw i64 [[I1]], 3
+; CHECK-SVE-NEXT:    [[XGEP4:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I4]]
+; CHECK-SVE-NEXT:    [[X4:%.*]] = load float, ptr [[XGEP4]], align 4
+; CHECK-SVE-NEXT:    [[AX4:%.*]] = fmul fast float [[X4]], [[A]]
+; CHECK-SVE-NEXT:    [[YGEP4:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I4]]
+; CHECK-SVE-NEXT:    [[Y4:%.*]] = load float, ptr [[YGEP4]], align 4
+; CHECK-SVE-NEXT:    [[AXPY4:%.*]] = fadd fast float [[Y4]], [[AX4]]
+; CHECK-SVE-NEXT:    store float [[AXPY4]], ptr [[YGEP4]], align 4
+; CHECK-SVE-NEXT:    [[I5:%.*]] = add nuw nsw i64 [[I1]], 4
+; CHECK-SVE-NEXT:    [[XGEP5:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I5]]
+; CHECK-SVE-NEXT:    [[X5:%.*]] = load float, ptr [[XGEP5]], align 4
+; CHECK-SVE-NEXT:    [[AX5:%.*]] = fmul fast float [[X5]], [[A]]
+; CHECK-SVE-NEXT:    [[YGEP5:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I5]]
+; CHECK-SVE-NEXT:    [[Y5:%.*]] = load float, ptr [[YGEP5]], align 4
+; CHECK-SVE-NEXT:    [[AXPY5:%.*]] = fadd fast float [[Y5]], [[AX5]]
+; CHECK-SVE-NEXT:    store float [[AXPY5]], ptr [[YGEP5]], align 4
+; CHECK-SVE-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I1]], 5
+; CHECK-SVE-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[N]], [[I_NEXT]]
+; CHECK-SVE-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-SVE:       [[EXIT_LOOPEXIT]]:
+; CHECK-SVE-NEXT:    br label %[[EXIT]]
+; CHECK-SVE:       [[EXIT]]:
+; CHECK-SVE-NEXT:    ret void
+;
+entry:
+  %0 = icmp sgt i64 %n, 0
+  br i1 %0, label %loop, label %exit
+
+loop:
+  %i1 = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %xgep1 = getelementptr inbounds nuw float, ptr %x, i64 %i1
+  %x1 = load float, ptr %xgep1, align 4
+  %ax1 = fmul fast float %x1, %a
+  %ygep1 = getelementptr inbounds nuw float, ptr %y, i64 %i1
+  %y1 = load float, ptr %ygep1, align 4
+  %axpy1 = fadd fast float %y1, %ax1
+  store float %axpy1, ptr %ygep1, align 4
+  %i2 = add nuw nsw i64 %i1, 1
+  %xgep2 = getelementptr inbounds nuw float, ptr %x, i64 %i2
+  %x2 = load float, ptr %xgep2, align 4
+  %ax2 = fmul fast float %x2, %a
+  %ygep2 = getelementptr inbounds nuw float, ptr %y, i64 %i2
+  %y2 = load float, ptr %ygep2, align 4
+  %axpy2 = fadd fast float %y2, %ax2
+  store float %axpy2, ptr %ygep2, align 4
+  %i3 = add nuw nsw i64 %i1, 2
+  %xgep3 = getelementptr inbounds nuw float, ptr %x, i64 %i3
+  %x3 = load float, ptr %xgep3, align 4
+  %ax3 = fmul fast float %x3, %a
+  %ygep3 = getelementptr inbounds nuw float, ptr %y, i64 %i3
+  %y3 = load float, ptr %ygep3, align 4
+  %axpy3 = fadd fast float %y3, %ax3
+  store float %axpy3, ptr %ygep3, align 4
+  %i4 = add nuw nsw i64 %i1, 3
+  %xgep4 = getelementptr inbounds nuw float, ptr %x, i64 %i4
+  %x4 = load float, ptr %xgep4, align 4
+  %ax4 = fmul fast float %x4, %a
+  %ygep4 = getelementptr inbounds nuw float, ptr %y, i64 %i4
+  %y4 = load float, ptr %ygep4, align 4
+  %axpy4 = fadd fast float %y4, %ax4
+  store float %axpy4, ptr %ygep4, align 4
+  %i5 = add nuw nsw i64 %i1, 4
+  %xgep5 = getelementptr inbounds nuw float, ptr %x, i64 %i5
+  %x5 = load float, ptr %xgep5, align 4
+  %ax5 = fmul fast float %x5, %a
+  %ygep5 = getelementptr inbounds nuw float, ptr %y, i64 %i5
+  %y5 = load float, ptr %ygep5, align 4
+  %axpy5 = fadd fast float %y5, %ax5
+  store float %axpy5, ptr %ygep5, align 4
+  %i.next = add nuw nsw i64 %i1, 5
+  %cmp = icmp sgt i64 %n, %i.next
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+;.
+; CHECK-SVE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-SVE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-SVE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SVE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

From e3ef062f84bc5f96368f805aac5356c1b1481572 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj@nvidia.com>
Date: Tue, 26 Aug 2025 03:59:33 -0700
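Subject: [PATCH 2/2] Move test to interleave_vec.ll.

Move the SAXPY-unrolled-by-five test out of the standalone
LoopVectorize/AArch64/saxpy-5.ll file and into
PhaseOrdering/AArch64/interleave_vec.ll, renaming the function to
@saxpy_5.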
---
 .../LoopVectorize/AArch64/saxpy-5.ll          | 248 ------------------
 .../PhaseOrdering/AArch64/interleave_vec.ll   | 129 +++++++++
 2 files changed, 129 insertions(+), 248 deletions(-)
 delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/saxpy-5.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/saxpy-5.ll b/llvm/test/Transforms/LoopVectorize/AArch64/saxpy-5.ll
deleted file mode 100644
index b74acfb9f05ea..0000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/saxpy-5.ll
+++ /dev/null
@@ -1,248 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -S -passes=loop-vectorize | FileCheck %s --check-prefix=CHECK
-; RUN: opt < %s -S -passes=loop-vectorize -mattr=+sve | FileCheck %s --check-prefix=CHECK-SVE
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
-target triple = "aarch64-unknown-linux-gnu"
-
-; This test contains an example of a loop based on SAXPY manually unrolled by
-; five:
-;
-;   void saxpy(long n, float a, float *x, float *y) {
-;     for (int i = 0; i < n; i += 5) {
-;       y[i] += a * x[i];
-;       y[i + 1] += a * x[i + 1];
-;       y[i + 2] += a * x[i + 2];
-;       y[i + 3] += a * x[i + 3];
-;       y[i + 4] += a * x[i + 4];
-;     }
-;   }
-;
-; Note: Although the loop is not vectorised with scalable vectors, we need +sve
-; for vectorisation due to an interaction with `prefersVectorizedAddressing'.
-
-define void @saxpy(i64 %n, float %a, ptr readonly %x, ptr noalias %y) {
-; CHECK-LABEL: define void @saxpy(
-; CHECK-SAME: i64 [[N:%.*]], float [[A:%.*]], ptr readonly [[X:%.*]], ptr noalias [[Y:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 0
-; CHECK-NEXT:    br i1 [[TMP0]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
-; CHECK:       [[LOOP_PREHEADER]]:
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
-; CHECK-NEXT:    [[XGEP1:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I1]]
-; CHECK-NEXT:    [[X1:%.*]] = load float, ptr [[XGEP1]], align 4
-; CHECK-NEXT:    [[AX1:%.*]] = fmul fast float [[X1]], [[A]]
-; CHECK-NEXT:    [[YGEP1:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I1]]
-; CHECK-NEXT:    [[Y1:%.*]] = load float, ptr [[YGEP1]], align 4
-; CHECK-NEXT:    [[AXPY1:%.*]] = fadd fast float [[Y1]], [[AX1]]
-; CHECK-NEXT:    store float [[AXPY1]], ptr [[YGEP1]], align 4
-; CHECK-NEXT:    [[I2:%.*]] = add nuw nsw i64 [[I1]], 1
-; CHECK-NEXT:    [[XGEP2:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I2]]
-; CHECK-NEXT:    [[X2:%.*]] = load float, ptr [[XGEP2]], align 4
-; CHECK-NEXT:    [[AX2:%.*]] = fmul fast float [[X2]], [[A]]
-; CHECK-NEXT:    [[YGEP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I2]]
-; CHECK-NEXT:    [[Y2:%.*]] = load float, ptr [[YGEP2]], align 4
-; CHECK-NEXT:    [[AXPY2:%.*]] = fadd fast float [[Y2]], [[AX2]]
-; CHECK-NEXT:    store float [[AXPY2]], ptr [[YGEP2]], align 4
-; CHECK-NEXT:    [[I3:%.*]] = add nuw nsw i64 [[I1]], 2
-; CHECK-NEXT:    [[XGEP3:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I3]]
-; CHECK-NEXT:    [[X3:%.*]] = load float, ptr [[XGEP3]], align 4
-; CHECK-NEXT:    [[AX3:%.*]] = fmul fast float [[X3]], [[A]]
-; CHECK-NEXT:    [[YGEP3:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I3]]
-; CHECK-NEXT:    [[Y3:%.*]] = load float, ptr [[YGEP3]], align 4
-; CHECK-NEXT:    [[AXPY3:%.*]] = fadd fast float [[Y3]], [[AX3]]
-; CHECK-NEXT:    store float [[AXPY3]], ptr [[YGEP3]], align 4
-; CHECK-NEXT:    [[I4:%.*]] = add nuw nsw i64 [[I1]], 3
-; CHECK-NEXT:    [[XGEP4:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I4]]
-; CHECK-NEXT:    [[X4:%.*]] = load float, ptr [[XGEP4]], align 4
-; CHECK-NEXT:    [[AX4:%.*]] = fmul fast float [[X4]], [[A]]
-; CHECK-NEXT:    [[YGEP4:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I4]]
-; CHECK-NEXT:    [[Y4:%.*]] = load float, ptr [[YGEP4]], align 4
-; CHECK-NEXT:    [[AXPY4:%.*]] = fadd fast float [[Y4]], [[AX4]]
-; CHECK-NEXT:    store float [[AXPY4]], ptr [[YGEP4]], align 4
-; CHECK-NEXT:    [[I5:%.*]] = add nuw nsw i64 [[I1]], 4
-; CHECK-NEXT:    [[XGEP5:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I5]]
-; CHECK-NEXT:    [[X5:%.*]] = load float, ptr [[XGEP5]], align 4
-; CHECK-NEXT:    [[AX5:%.*]] = fmul fast float [[X5]], [[A]]
-; CHECK-NEXT:    [[YGEP5:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I5]]
-; CHECK-NEXT:    [[Y5:%.*]] = load float, ptr [[YGEP5]], align 4
-; CHECK-NEXT:    [[AXPY5:%.*]] = fadd fast float [[Y5]], [[AX5]]
-; CHECK-NEXT:    store float [[AXPY5]], ptr [[YGEP5]], align 4
-; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I1]], 5
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[N]], [[I_NEXT]]
-; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT:.*]]
-; CHECK:       [[EXIT_LOOPEXIT]]:
-; CHECK-NEXT:    br label %[[EXIT]]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    ret void
-;
-; CHECK-SVE-LABEL: define void @saxpy(
-; CHECK-SVE-SAME: i64 [[N:%.*]], float [[A:%.*]], ptr readonly [[X:%.*]], ptr noalias [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-SVE-NEXT:  [[ENTRY:.*:]]
-; CHECK-SVE-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 0
-; CHECK-SVE-NEXT:    br i1 [[TMP0]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
-; CHECK-SVE:       [[LOOP_PREHEADER]]:
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = add i64 [[N]], -1
-; CHECK-SVE-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 5
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2
-; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK-SVE:       [[VECTOR_PH]]:
-; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
-; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
-; CHECK-SVE-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 5
-; CHECK-SVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[A]], i64 0
-; CHECK-SVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE:       [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
-; CHECK-SVE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]]
-; CHECK-SVE-NEXT:    [[WIDE_VEC:%.*]] = load <10 x float>, ptr [[TMP5]], align 4
-; CHECK-SVE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
-; CHECK-SVE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 1, i32 6>
-; CHECK-SVE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 2, i32 7>
-; CHECK-SVE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 3, i32 8>
-; CHECK-SVE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 4, i32 9>
-; CHECK-SVE-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]]
-; CHECK-SVE-NEXT:    [[WIDE_VEC5:%.*]] = load <10 x float>, ptr [[TMP7]], align 4
-; CHECK-SVE-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
-; CHECK-SVE-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 1, i32 6>
-; CHECK-SVE-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 2, i32 7>
-; CHECK-SVE-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 3, i32 8>
-; CHECK-SVE-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 4, i32 9>
-; CHECK-SVE-NEXT:    [[TMP8:%.*]] = fadd fast <2 x float> [[STRIDED_VEC6]], [[TMP6]]
-; CHECK-SVE-NEXT:    [[TMP9:%.*]] = fmul fast <2 x float> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-NEXT:    [[TMP10:%.*]] = fadd fast <2 x float> [[STRIDED_VEC7]], [[TMP9]]
-; CHECK-SVE-NEXT:    [[TMP11:%.*]] = fmul fast <2 x float> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-NEXT:    [[TMP12:%.*]] = fadd fast <2 x float> [[STRIDED_VEC8]], [[TMP11]]
-; CHECK-SVE-NEXT:    [[TMP13:%.*]] = fmul fast <2 x float> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-NEXT:    [[TMP14:%.*]] = fadd fast <2 x float> [[STRIDED_VEC9]], [[TMP13]]
-; CHECK-SVE-NEXT:    [[TMP15:%.*]] = fmul fast <2 x float> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-NEXT:    [[TMP16:%.*]] = fadd fast <2 x float> [[STRIDED_VEC10]], [[TMP15]]
-; CHECK-SVE-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-SVE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-SVE-NEXT:    [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-SVE-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-SVE-NEXT:    [[TMP21:%.*]] = shufflevector <8 x float> [[TMP19]], <8 x float> [[TMP20]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-; CHECK-SVE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <10 x float> [[TMP21]], <10 x float> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
-; CHECK-SVE-NEXT:    store <10 x float> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
-; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-SVE-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-SVE-NEXT:    br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-SVE:       [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
-; CHECK-SVE:       [[SCALAR_PH]]:
-; CHECK-SVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
-; CHECK-SVE-NEXT:    br label %[[LOOP:.*]]
-; CHECK-SVE:       [[LOOP]]:
-; CHECK-SVE-NEXT:    [[I1:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; CHECK-SVE-NEXT:    [[XGEP1:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I1]]
-; CHECK-SVE-NEXT:    [[X1:%.*]] = load float, ptr [[XGEP1]], align 4
-; CHECK-SVE-NEXT:    [[AX1:%.*]] = fmul fast float [[X1]], [[A]]
-; CHECK-SVE-NEXT:    [[YGEP1:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I1]]
-; CHECK-SVE-NEXT:    [[Y1:%.*]] = load float, ptr [[YGEP1]], align 4
-; CHECK-SVE-NEXT:    [[AXPY1:%.*]] = fadd fast float [[Y1]], [[AX1]]
-; CHECK-SVE-NEXT:    store float [[AXPY1]], ptr [[YGEP1]], align 4
-; CHECK-SVE-NEXT:    [[I2:%.*]] = add nuw nsw i64 [[I1]], 1
-; CHECK-SVE-NEXT:    [[XGEP2:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I2]]
-; CHECK-SVE-NEXT:    [[X2:%.*]] = load float, ptr [[XGEP2]], align 4
-; CHECK-SVE-NEXT:    [[AX2:%.*]] = fmul fast float [[X2]], [[A]]
-; CHECK-SVE-NEXT:    [[YGEP2:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I2]]
-; CHECK-SVE-NEXT:    [[Y2:%.*]] = load float, ptr [[YGEP2]], align 4
-; CHECK-SVE-NEXT:    [[AXPY2:%.*]] = fadd fast float [[Y2]], [[AX2]]
-; CHECK-SVE-NEXT:    store float [[AXPY2]], ptr [[YGEP2]], align 4
-; CHECK-SVE-NEXT:    [[I3:%.*]] = add nuw nsw i64 [[I1]], 2
-; CHECK-SVE-NEXT:    [[XGEP3:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I3]]
-; CHECK-SVE-NEXT:    [[X3:%.*]] = load float, ptr [[XGEP3]], align 4
-; CHECK-SVE-NEXT:    [[AX3:%.*]] = fmul fast float [[X3]], [[A]]
-; CHECK-SVE-NEXT:    [[YGEP3:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I3]]
-; CHECK-SVE-NEXT:    [[Y3:%.*]] = load float, ptr [[YGEP3]], align 4
-; CHECK-SVE-NEXT:    [[AXPY3:%.*]] = fadd fast float [[Y3]], [[AX3]]
-; CHECK-SVE-NEXT:    store float [[AXPY3]], ptr [[YGEP3]], align 4
-; CHECK-SVE-NEXT:    [[I4:%.*]] = add nuw nsw i64 [[I1]], 3
-; CHECK-SVE-NEXT:    [[XGEP4:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I4]]
-; CHECK-SVE-NEXT:    [[X4:%.*]] = load float, ptr [[XGEP4]], align 4
-; CHECK-SVE-NEXT:    [[AX4:%.*]] = fmul fast float [[X4]], [[A]]
-; CHECK-SVE-NEXT:    [[YGEP4:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I4]]
-; CHECK-SVE-NEXT:    [[Y4:%.*]] = load float, ptr [[YGEP4]], align 4
-; CHECK-SVE-NEXT:    [[AXPY4:%.*]] = fadd fast float [[Y4]], [[AX4]]
-; CHECK-SVE-NEXT:    store float [[AXPY4]], ptr [[YGEP4]], align 4
-; CHECK-SVE-NEXT:    [[I5:%.*]] = add nuw nsw i64 [[I1]], 4
-; CHECK-SVE-NEXT:    [[XGEP5:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I5]]
-; CHECK-SVE-NEXT:    [[X5:%.*]] = load float, ptr [[XGEP5]], align 4
-; CHECK-SVE-NEXT:    [[AX5:%.*]] = fmul fast float [[X5]], [[A]]
-; CHECK-SVE-NEXT:    [[YGEP5:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I5]]
-; CHECK-SVE-NEXT:    [[Y5:%.*]] = load float, ptr [[YGEP5]], align 4
-; CHECK-SVE-NEXT:    [[AXPY5:%.*]] = fadd fast float [[Y5]], [[AX5]]
-; CHECK-SVE-NEXT:    store float [[AXPY5]], ptr [[YGEP5]], align 4
-; CHECK-SVE-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I1]], 5
-; CHECK-SVE-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[N]], [[I_NEXT]]
-; CHECK-SVE-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK-SVE:       [[EXIT_LOOPEXIT]]:
-; CHECK-SVE-NEXT:    br label %[[EXIT]]
-; CHECK-SVE:       [[EXIT]]:
-; CHECK-SVE-NEXT:    ret void
-;
-entry:
-  %0 = icmp sgt i64 %n, 0
-  br i1 %0, label %loop, label %exit
-
-loop:
-  %i1 = phi i64 [ %i.next, %loop ], [ 0, %entry ]
-  %xgep1 = getelementptr inbounds nuw float, ptr %x, i64 %i1
-  %x1 = load float, ptr %xgep1, align 4
-  %ax1 = fmul fast float %x1, %a
-  %ygep1 = getelementptr inbounds nuw float, ptr %y, i64 %i1
-  %y1 = load float, ptr %ygep1, align 4
-  %axpy1 = fadd fast float %y1, %ax1
-  store float %axpy1, ptr %ygep1, align 4
-  %i2 = add nuw nsw i64 %i1, 1
-  %xgep2 = getelementptr inbounds nuw float, ptr %x, i64 %i2
-  %x2 = load float, ptr %xgep2, align 4
-  %ax2 = fmul fast float %x2, %a
-  %ygep2 = getelementptr inbounds nuw float, ptr %y, i64 %i2
-  %y2 = load float, ptr %ygep2, align 4
-  %axpy2 = fadd fast float %y2, %ax2
-  store float %axpy2, ptr %ygep2, align 4
-  %i3 = add nuw nsw i64 %i1, 2
-  %xgep3 = getelementptr inbounds nuw float, ptr %x, i64 %i3
-  %x3 = load float, ptr %xgep3, align 4
-  %ax3 = fmul fast float %x3, %a
-  %ygep3 = getelementptr inbounds nuw float, ptr %y, i64 %i3
-  %y3 = load float, ptr %ygep3, align 4
-  %axpy3 = fadd fast float %y3, %ax3
-  store float %axpy3, ptr %ygep3, align 4
-  %i4 = add nuw nsw i64 %i1, 3
-  %xgep4 = getelementptr inbounds nuw float, ptr %x, i64 %i4
-  %x4 = load float, ptr %xgep4, align 4
-  %ax4 = fmul fast float %x4, %a
-  %ygep4 = getelementptr inbounds nuw float, ptr %y, i64 %i4
-  %y4 = load float, ptr %ygep4, align 4
-  %axpy4 = fadd fast float %y4, %ax4
-  store float %axpy4, ptr %ygep4, align 4
-  %i5 = add nuw nsw i64 %i1, 4
-  %xgep5 = getelementptr inbounds nuw float, ptr %x, i64 %i5
-  %x5 = load float, ptr %xgep5, align 4
-  %ax5 = fmul fast float %x5, %a
-  %ygep5 = getelementptr inbounds nuw float, ptr %y, i64 %i5
-  %y5 = load float, ptr %ygep5, align 4
-  %axpy5 = fadd fast float %y5, %ax5
-  store float %axpy5, ptr %ygep5, align 4
-  %i.next = add nuw nsw i64 %i1, 5
-  %cmp = icmp sgt i64 %n, %i.next
-  br i1 %cmp, label %loop, label %exit
-
-exit:
-  ret void
-}
-;.
-; CHECK-SVE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-SVE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-SVE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-SVE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
index afe7d7498fc1d..2dceb27165c4d 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
@@ -1013,6 +1013,133 @@ for.inc9:                                         ; preds = %for.end
 for.end11:                                        ; preds = %for.cond
   ret void
 }
+
+; This test contains an example of a SAXPY loop manually unrolled by five:
+;
+;   void saxpy(long n, float a, float *x, float *y) {
+;     for (int i = 0; i < n; i += 5) {
+;       y[i] += a * x[i];
+;       y[i + 1] += a * x[i + 1];
+;       y[i + 2] += a * x[i + 2];
+;       y[i + 3] += a * x[i + 3];
+;       y[i + 4] += a * x[i + 4];
+;     }
+;   }
+;
+define void @saxpy_5(i64 %n, float %a, ptr readonly %x, ptr noalias %y) {
+; CHECK-LABEL: define void @saxpy_5(
+; CHECK-SAME: i64 [[N:%.*]], float [[A:%.*]], ptr readonly captures(none) [[X:%.*]], ptr noalias captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[N]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 5
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 6
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[LOOP_PREHEADER11:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775806
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 5
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[A]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <10 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <10 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_VEC5:%.*]] = load <10 x float>, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <10 x float> [[WIDE_VEC]], [[TMP5]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd fast <10 x float> [[WIDE_VEC5]], [[TMP8]]
+; CHECK-NEXT:    store <10 x float> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT]], label %[[LOOP_PREHEADER11]]
+; CHECK:       [[LOOP_PREHEADER11]]:
+; CHECK-NEXT:    [[I1_PH:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP]] ], [ [[I1_PH]], %[[LOOP_PREHEADER11]] ]
+; CHECK-NEXT:    [[XGEP1:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I1]]
+; CHECK-NEXT:    [[YGEP1:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x float>, ptr [[XGEP1]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul fast <4 x float> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x float>, ptr [[YGEP1]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd fast <4 x float> [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    store <4 x float> [[TMP15]], ptr [[YGEP1]], align 4
+; CHECK-NEXT:    [[I5:%.*]] = add nuw nsw i64 [[I1]], 4
+; CHECK-NEXT:    [[XGEP5:%.*]] = getelementptr inbounds nuw float, ptr [[X]], i64 [[I5]]
+; CHECK-NEXT:    [[X5:%.*]] = load float, ptr [[XGEP5]], align 4
+; CHECK-NEXT:    [[AX5:%.*]] = fmul fast float [[X5]], [[A]]
+; CHECK-NEXT:    [[YGEP5:%.*]] = getelementptr inbounds nuw float, ptr [[Y]], i64 [[I5]]
+; CHECK-NEXT:    [[Y5:%.*]] = load float, ptr [[YGEP5]], align 4
+; CHECK-NEXT:    [[AXPY5:%.*]] = fadd fast float [[Y5]], [[AX5]]
+; CHECK-NEXT:    store float [[AXPY5]], ptr [[YGEP5]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I1]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[N]], [[I_NEXT]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:  ; input IR: scalar loop that updates five consecutive elements of y per iteration
+  %0 = icmp sgt i64 %n, 0  ; guard: the loop is entered only when n > 0 (signed)
+  br i1 %0, label %loop, label %exit
+
+loop:
+  %i1 = phi i64 [ %i.next, %loop ], [ 0, %entry ]  ; i: 0 when coming from entry, %i.next on the back edge
+  %xgep1 = getelementptr inbounds nuw float, ptr %x, i64 %i1  ; group 1: y[i] += a * x[i]
+  %x1 = load float, ptr %xgep1, align 4
+  %ax1 = fmul fast float %x1, %a
+  %ygep1 = getelementptr inbounds nuw float, ptr %y, i64 %i1
+  %y1 = load float, ptr %ygep1, align 4
+  %axpy1 = fadd fast float %y1, %ax1
+  store float %axpy1, ptr %ygep1, align 4  ; result goes back through the same pointer %y1 was loaded from
+  %i2 = add nuw nsw i64 %i1, 1  ; group 2: y[i + 1] += a * x[i + 1]
+  %xgep2 = getelementptr inbounds nuw float, ptr %x, i64 %i2
+  %x2 = load float, ptr %xgep2, align 4
+  %ax2 = fmul fast float %x2, %a
+  %ygep2 = getelementptr inbounds nuw float, ptr %y, i64 %i2
+  %y2 = load float, ptr %ygep2, align 4
+  %axpy2 = fadd fast float %y2, %ax2
+  store float %axpy2, ptr %ygep2, align 4
+  %i3 = add nuw nsw i64 %i1, 2  ; group 3: y[i + 2] += a * x[i + 2]
+  %xgep3 = getelementptr inbounds nuw float, ptr %x, i64 %i3
+  %x3 = load float, ptr %xgep3, align 4
+  %ax3 = fmul fast float %x3, %a
+  %ygep3 = getelementptr inbounds nuw float, ptr %y, i64 %i3
+  %y3 = load float, ptr %ygep3, align 4
+  %axpy3 = fadd fast float %y3, %ax3
+  store float %axpy3, ptr %ygep3, align 4
+  %i4 = add nuw nsw i64 %i1, 3  ; group 4: y[i + 3] += a * x[i + 3]
+  %xgep4 = getelementptr inbounds nuw float, ptr %x, i64 %i4
+  %x4 = load float, ptr %xgep4, align 4
+  %ax4 = fmul fast float %x4, %a
+  %ygep4 = getelementptr inbounds nuw float, ptr %y, i64 %i4
+  %y4 = load float, ptr %ygep4, align 4
+  %axpy4 = fadd fast float %y4, %ax4
+  store float %axpy4, ptr %ygep4, align 4
+  %i5 = add nuw nsw i64 %i1, 4  ; group 5: y[i + 4] += a * x[i + 4]
+  %xgep5 = getelementptr inbounds nuw float, ptr %x, i64 %i5
+  %x5 = load float, ptr %xgep5, align 4
+  %ax5 = fmul fast float %x5, %a
+  %ygep5 = getelementptr inbounds nuw float, ptr %y, i64 %i5
+  %y5 = load float, ptr %ygep5, align 4
+  %axpy5 = fadd fast float %y5, %ax5
+  store float %axpy5, ptr %ygep5, align 4
+  %i.next = add nuw nsw i64 %i1, 5  ; advance the index by the five elements handled above
+  %cmp = icmp sgt i64 %n, %i.next  ; keep looping while %i.next < n (signed compare)
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -1023,4 +1150,6 @@ for.end11:                                        ; preds = %for.cond
 ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]}
 ;.