
Commit ccd4e7b

[LSR] Make OptimizeLoopTermCond able to handle some non-cmp conditions (#165590)
Currently OptimizeLoopTermCond can only convert a cmp instruction to use a post-increment induction variable, which means it can't handle predicated loops where the termination condition comes from get.active.lane.mask. Relax this restriction so that any kind of instruction can be handled, though only if it is the instruction immediately before the branch (except for a possible intervening extractelement).
1 parent c128fd9 commit ccd4e7b
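For readers skimming the description before the diff, the sketch below paraphrases the condition-discovery step this patch adds to OptimizeLoopTermCond: walk from the conditional branch to the instruction that defines its condition, looking through an optional extractelement (for example one reading lane 0 of an llvm.get.active.lane.mask result). This is an illustrative standalone helper, not the committed code; the name findTermCondInst is made up for this example.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Return the instruction that defines ExitingBlock's termination condition,
// or nullptr if the block does not end in a conditional branch on an
// instruction. If the condition is an extractelement, return the instruction
// feeding its vector operand, since that is what actually generated the
// condition.
Instruction *findTermCondInst(BasicBlock *ExitingBlock) {
  auto *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
  if (!TermBr || TermBr->isUnconditional())
    return nullptr;

  auto *Cond = dyn_cast<Instruction>(TermBr->getCondition());
  if (auto *Extract = dyn_cast_or_null<ExtractElementInst>(Cond))
    Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
  return Cond;
}

In the patch itself, OptimizeLoopTermCond then asks FindIVUserForCond whether the discovered instruction is a recorded IV user, and only moves or clones the condition when it is a compare that is not already immediately before the branch.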

3 files changed: +245, −33 lines changed

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

Lines changed: 21 additions & 12 deletions
@@ -2195,8 +2195,8 @@ class LSRInstance {
   SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;

   void OptimizeShadowIV();
-  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
-  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
+  bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
+  Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
   void OptimizeLoopTermCond();

   void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
@@ -2431,7 +2431,7 @@ void LSRInstance::OptimizeShadowIV() {

 /// If Cond has an operand that is an expression of an IV, set the IV user and
 /// stride information and return true, otherwise return false.
-bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
   for (IVStrideUse &U : IU)
     if (U.getUser() == Cond) {
       // NOTE: we could handle setcc instructions with multiple uses here, but
@@ -2491,7 +2491,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
 /// This function solves this problem by detecting this type of loop and
 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
 /// the instructions for the maximum computation.
-ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
+Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
   // Check that the loop matches the pattern we're looking for.
   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
       Cond->getPredicate() != CmpInst::ICMP_NE)
@@ -2635,15 +2635,22 @@ LSRInstance::OptimizeLoopTermCond() {
     // one register value.

     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
-    if (!TermBr)
+    if (!TermBr || TermBr->isUnconditional())
       continue;
-    // FIXME: Overly conservative, termination condition could be an 'or' etc..
-    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+
+    Instruction *Cond = dyn_cast<Instruction>(TermBr->getCondition());
+    // If the argument to TermBr is an extractelement, then the source of that
+    // instruction is what's generated the condition.
+    auto *Extract = dyn_cast_or_null<ExtractElementInst>(Cond);
+    if (Extract)
+      Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
+    // FIXME: We could do more here, like handling logical operations where one
+    // side is a cmp that uses an induction variable.
+    if (!Cond)
       continue;

     // Search IVUsesByStride to find Cond's IVUse if there is one.
     IVStrideUse *CondUse = nullptr;
-    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
     if (!FindIVUserForCond(Cond, CondUse))
       continue;

@@ -2653,7 +2660,8 @@ LSRInstance::OptimizeLoopTermCond() {
     // One consequence of doing this now is that it disrupts the count-down
     // optimization. That's not always a bad thing though, because in such
     // cases it may still be worthwhile to avoid a max.
-    Cond = OptimizeMax(Cond, CondUse);
+    if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
+      Cond = OptimizeMax(Cmp, CondUse);

     // If this exiting block dominates the latch block, it may also use
     // the post-inc value if it won't be shared with other uses.
@@ -2718,13 +2726,14 @@ LSRInstance::OptimizeLoopTermCond() {
     // It's possible for the setcc instruction to be anywhere in the loop, and
     // possible for it to have multiple users. If it is not immediately before
     // the exiting block branch, move it.
-    if (Cond->getNextNode() != TermBr) {
+    if (isa_and_nonnull<CmpInst>(Cond) && Cond->getNextNode() != TermBr &&
+        !Extract) {
       if (Cond->hasOneUse()) {
         Cond->moveBefore(TermBr->getIterator());
       } else {
         // Clone the terminating condition and insert into the loopend.
-        ICmpInst *OldCond = Cond;
-        Cond = cast<ICmpInst>(Cond->clone());
+        Instruction *OldCond = Cond;
+        Cond = Cond->clone();
         Cond->setName(L->getHeader()->getName() + ".termcond");
         Cond->insertInto(ExitingBlock, TermBr->getIterator());

llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll

Lines changed: 19 additions & 21 deletions
@@ -16,32 +16,32 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov w8, #100 // =0x64
-; CHECK-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: rdvl x10, #2
+; CHECK-NEXT: mov w9, #100 // =0x64
+; CHECK-NEXT: whilelo p1.d, xzr, x9
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: cntd x10
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x11, x9
+; CHECK-NEXT: rdvl x11, #2
 ; CHECK-NEXT: .LBB0_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
 ; CHECK-NEXT: mov z6.d, z0.d
 ; CHECK-NEXT: mov z7.d, z1.d
 ; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
+; CHECK-NEXT: add x8, x8, x10
 ; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
 ; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
 ; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
 ; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
-; CHECK-NEXT: add x1, x1, x10
-; CHECK-NEXT: add x0, x0, x10
+; CHECK-NEXT: add x1, x1, x11
+; CHECK-NEXT: add x0, x0, x11
 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT: mov z1.d, p2/m, z7.d
 ; CHECK-NEXT: mov z0.d, p1/m, z6.d
-; CHECK-NEXT: whilelo p1.d, x11, x8
-; CHECK-NEXT: add x11, x11, x9
+; CHECK-NEXT: whilelo p1.d, x8, x9
 ; CHECK-NEXT: b.mi .LBB0_1
 ; CHECK-NEXT: // %bb.2: // %exit.block
 ; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d
@@ -213,19 +213,18 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov w8, #100 // =0x64
-; CHECK-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: rdvl x10, #2
+; CHECK-NEXT: mov w9, #100 // =0x64
+; CHECK-NEXT: whilelo p1.d, xzr, x9
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: cntd x10
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: cnth x11
-; CHECK-NEXT: mov x12, x9
+; CHECK-NEXT: rdvl x11, #2
 ; CHECK-NEXT: .LBB2_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2]
+; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2, x8, lsl #2]
 ; CHECK-NEXT: mov z6.d, z0.d
 ; CHECK-NEXT: mov z7.d, z1.d
-; CHECK-NEXT: add x2, x2, x11
+; CHECK-NEXT: add x8, x8, x10
 ; CHECK-NEXT: and z2.d, z2.d, #0xffffffff
 ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
@@ -234,16 +233,15 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
 ; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
 ; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
-; CHECK-NEXT: add x1, x1, x10
-; CHECK-NEXT: add x0, x0, x10
+; CHECK-NEXT: add x1, x1, x11
+; CHECK-NEXT: add x0, x0, x11
 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT: mov z1.d, p2/m, z7.d
 ; CHECK-NEXT: mov z0.d, p1/m, z6.d
-; CHECK-NEXT: whilelo p1.d, x12, x8
-; CHECK-NEXT: add x12, x12, x9
+; CHECK-NEXT: whilelo p1.d, x8, x9
 ; CHECK-NEXT: b.mi .LBB2_1
 ; CHECK-NEXT: // %bb.2: // %exit.block
 ; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d
Lines changed: 205 additions & 0 deletions
@@ -0,0 +1,205 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -loop-reduce %s -S -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Tests where the loop termination condition is not generated by a compare.
+
+; The call to get.active.lane.mask in the loop should use the postincrement
+; value of %iv.
+define void @lane_mask(ptr %dst, i64 %n) #0 {
+; CHECK-LABEL: define void @lane_mask(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[VSCALEX4:%.*]] = shl i64 [[VSCALE]], 2
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IV]], 2
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr align 4 [[SCEVGEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP1]] = add i64 [[IV]], [[VSCALEX4]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP1]], i64 [[N]])
+; CHECK-NEXT: [[COND:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vscalex4 = shl i64 %vscale, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr %gep, i32 4, <vscale x 4 x i1> %active.lane.mask)
+  %iv.next = add i64 %iv, %vscalex4
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %iv.next, i64 %n)
+  %cond = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The store between the call and the branch shouldn't prevent the
+; postincrement value from being used.
+define void @lane_mask_not_last(ptr %dst, i64 %n) #0 {
+; CHECK-LABEL: define void @lane_mask_not_last(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[VSCALEX4:%.*]] = shl i64 [[VSCALE]], 2
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP0:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IV]], 2
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP0]] = add i64 [[IV]], [[VSCALEX4]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP0]], i64 [[N]])
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr align 4 [[SCEVGEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[COND:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vscalex4 = shl i64 %vscale, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  %iv.next = add i64 %iv, %vscalex4
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %iv.next, i64 %n)
+  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr %gep, i32 4, <vscale x 4 x i1> %active.lane.mask)
+  %cond = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The call to cmp_fn in the loop should use the postincrement value of %iv.
+define void @uses_cmp_fn(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @uses_cmp_fn(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[LSR_IV]], 2
+; CHECK-NEXT: [[LSR_IV1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: store i32 0, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[LSR_IV_NEXT]])
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  store i32 0, ptr %gep, align 4
+  %iv.next = add i64 %iv, 1
+  %cond = tail call i1 @cmp_fn(i64 %iv.next)
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The store between the call and the branch shouldn't prevent the
+; postincrement value from being used.
+define void @uses_cmp_fn_not_last(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @uses_cmp_fn_not_last(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[LSR_IV:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IV]], 2
+; CHECK-NEXT: [[LSR_IV1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: [[LSR_IV]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[LSR_IV]])
+; CHECK-NEXT: store i32 0, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  %iv.next = add i64 %iv, 1
+  %cond = tail call i1 @cmp_fn(i64 %iv.next)
+  store i32 0, ptr %gep, align 4
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; cmp2 will use a preincrement induction variable as it isn't directly the loop
+; termination condition.
+; FIXME: We could potentially handle this by examining the operands of the 'and'
+; instruction.
+define void @cmp_and(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @cmp_and(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[LOOP]] ], [ [[DST]], %[[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[TMP0]], %[[ENTRY]] ]
+; CHECK-NEXT: [[VAL:%.*]] = load i64, ptr [[LSR_IV1]], align 8
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[VAL]], [[N]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[LSR_IV]], 0
+; CHECK-NEXT: [[COND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  %val = load i64, ptr %gep, align 8
+  %iv.next = add i64 %iv, 1
+  %cmp1 = icmp ne i64 %val, %n
+  %cmp2 = icmp ne i64 %iv.next, %n
+  %cond = and i1 %cmp1, %cmp2
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+
+declare i64 @llvm.vscale.i64()
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr captures(none), i32 immarg, <vscale x 4 x i1>)
+declare i1 @cmp_fn(i64)
+
+attributes #0 = { "target-features"="+sve2" }
