
Commit ccd4e7b

[LSR] Make OptimizeLoopTermCond able to handle some non-cmp conditions (#165590)
Currently OptimizeLoopTermCond can only convert a cmp instruction to use a post-increment induction variable, which means it can't handle predicated loops where the termination condition comes from get.active.lane.mask. Relax this restriction so that any kind of instruction can be handled, though only if it is the instruction immediately before the branch (except for a possible intervening extractelement).
1 parent c128fd9 commit ccd4e7b
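For readers skimming the description before the diff, the sketch below paraphrases the condition-discovery step this patch adds to OptimizeLoopTermCond: walk from the conditional branch to the instruction that defines its condition, looking through an optional extractelement (for example one reading lane 0 of an llvm.get.active.lane.mask result). This is an illustrative standalone helper, not the committed code; the name findTermCondInst is made up for this example.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Return the instruction that defines ExitingBlock's termination condition,
// or nullptr if the block does not end in a conditional branch on an
// instruction. If the condition is an extractelement, return the instruction
// feeding its vector operand, since that is what actually generated the
// condition.
Instruction *findTermCondInst(BasicBlock *ExitingBlock) {
  auto *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
  if (!TermBr || TermBr->isUnconditional())
    return nullptr;

  auto *Cond = dyn_cast<Instruction>(TermBr->getCondition());
  if (auto *Extract = dyn_cast_or_null<ExtractElementInst>(Cond))
    Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
  return Cond;
}

In the patch itself, OptimizeLoopTermCond then asks FindIVUserForCond whether the discovered instruction is a recorded IV user, and only moves or clones the condition when it is a compare that is not already immediately before the branch.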

3 files changed: +245, −33 lines changed

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

Lines changed: 21 additions & 12 deletions
@@ -2195,8 +2195,8 @@ class LSRInstance {
   SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;

   void OptimizeShadowIV();
-  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
-  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
+  bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
+  Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
   void OptimizeLoopTermCond();

   void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
@@ -2431,7 +2431,7 @@ void LSRInstance::OptimizeShadowIV() {

 /// If Cond has an operand that is an expression of an IV, set the IV user and
 /// stride information and return true, otherwise return false.
-bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
   for (IVStrideUse &U : IU)
     if (U.getUser() == Cond) {
       // NOTE: we could handle setcc instructions with multiple uses here, but
@@ -2491,7 +2491,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
 /// This function solves this problem by detecting this type of loop and
 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
 /// the instructions for the maximum computation.
-ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
+Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
   // Check that the loop matches the pattern we're looking for.
   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
       Cond->getPredicate() != CmpInst::ICMP_NE)
@@ -2635,15 +2635,22 @@ LSRInstance::OptimizeLoopTermCond() {
     // one register value.

     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
-    if (!TermBr)
+    if (!TermBr || TermBr->isUnconditional())
       continue;
-    // FIXME: Overly conservative, termination condition could be an 'or' etc..
-    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+
+    Instruction *Cond = dyn_cast<Instruction>(TermBr->getCondition());
+    // If the argument to TermBr is an extractelement, then the source of that
+    // instruction is what's generated the condition.
+    auto *Extract = dyn_cast_or_null<ExtractElementInst>(Cond);
+    if (Extract)
+      Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
+    // FIXME: We could do more here, like handling logical operations where one
+    // side is a cmp that uses an induction variable.
+    if (!Cond)
       continue;

     // Search IVUsesByStride to find Cond's IVUse if there is one.
     IVStrideUse *CondUse = nullptr;
-    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
     if (!FindIVUserForCond(Cond, CondUse))
       continue;

@@ -2653,7 +2660,8 @@ LSRInstance::OptimizeLoopTermCond() {
     // One consequence of doing this now is that it disrupts the count-down
     // optimization. That's not always a bad thing though, because in such
     // cases it may still be worthwhile to avoid a max.
-    Cond = OptimizeMax(Cond, CondUse);
+    if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
+      Cond = OptimizeMax(Cmp, CondUse);

     // If this exiting block dominates the latch block, it may also use
     // the post-inc value if it won't be shared with other uses.
@@ -2718,13 +2726,14 @@ LSRInstance::OptimizeLoopTermCond() {
     // It's possible for the setcc instruction to be anywhere in the loop, and
     // possible for it to have multiple users. If it is not immediately before
     // the exiting block branch, move it.
-    if (Cond->getNextNode() != TermBr) {
+    if (isa_and_nonnull<CmpInst>(Cond) && Cond->getNextNode() != TermBr &&
+        !Extract) {
       if (Cond->hasOneUse()) {
         Cond->moveBefore(TermBr->getIterator());
       } else {
         // Clone the terminating condition and insert into the loopend.
-        ICmpInst *OldCond = Cond;
-        Cond = cast<ICmpInst>(Cond->clone());
+        Instruction *OldCond = Cond;
+        Cond = Cond->clone();
         Cond->setName(L->getHeader()->getName() + ".termcond");
         Cond->insertInto(ExitingBlock, TermBr->getIterator());

llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll

Lines changed: 19 additions & 21 deletions
@@ -16,32 +16,32 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov w8, #100 // =0x64
-; CHECK-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: rdvl x10, #2
+; CHECK-NEXT: mov w9, #100 // =0x64
+; CHECK-NEXT: whilelo p1.d, xzr, x9
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: cntd x10
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x11, x9
+; CHECK-NEXT: rdvl x11, #2
 ; CHECK-NEXT: .LBB0_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
 ; CHECK-NEXT: mov z6.d, z0.d
 ; CHECK-NEXT: mov z7.d, z1.d
 ; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
+; CHECK-NEXT: add x8, x8, x10
 ; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
 ; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
 ; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
 ; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
-; CHECK-NEXT: add x1, x1, x10
-; CHECK-NEXT: add x0, x0, x10
+; CHECK-NEXT: add x1, x1, x11
+; CHECK-NEXT: add x0, x0, x11
 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT: mov z1.d, p2/m, z7.d
 ; CHECK-NEXT: mov z0.d, p1/m, z6.d
-; CHECK-NEXT: whilelo p1.d, x11, x8
-; CHECK-NEXT: add x11, x11, x9
+; CHECK-NEXT: whilelo p1.d, x8, x9
 ; CHECK-NEXT: b.mi .LBB0_1
 ; CHECK-NEXT: // %bb.2: // %exit.block
 ; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d
@@ -213,19 +213,18 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov w8, #100 // =0x64
-; CHECK-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: rdvl x10, #2
+; CHECK-NEXT: mov w9, #100 // =0x64
+; CHECK-NEXT: whilelo p1.d, xzr, x9
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: cntd x10
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: cnth x11
-; CHECK-NEXT: mov x12, x9
+; CHECK-NEXT: rdvl x11, #2
 ; CHECK-NEXT: .LBB2_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2]
+; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2, x8, lsl #2]
 ; CHECK-NEXT: mov z6.d, z0.d
 ; CHECK-NEXT: mov z7.d, z1.d
-; CHECK-NEXT: add x2, x2, x11
+; CHECK-NEXT: add x8, x8, x10
 ; CHECK-NEXT: and z2.d, z2.d, #0xffffffff
 ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
@@ -234,16 +233,15 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
 ; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
 ; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
-; CHECK-NEXT: add x1, x1, x10
-; CHECK-NEXT: add x0, x0, x10
+; CHECK-NEXT: add x1, x1, x11
+; CHECK-NEXT: add x0, x0, x11
 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT: mov z1.d, p2/m, z7.d
 ; CHECK-NEXT: mov z0.d, p1/m, z6.d
-; CHECK-NEXT: whilelo p1.d, x12, x8
-; CHECK-NEXT: add x12, x12, x9
+; CHECK-NEXT: whilelo p1.d, x8, x9
 ; CHECK-NEXT: b.mi .LBB2_1
 ; CHECK-NEXT: // %bb.2: // %exit.block
 ; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d
Lines changed: 205 additions & 0 deletions
@@ -0,0 +1,205 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -loop-reduce %s -S -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Tests where the loop termination condition is not generated by a compare.
+
+; The call to get.active.lane.mask in the loop should use the postincrement
+; value of %iv.
+define void @lane_mask(ptr %dst, i64 %n) #0 {
+; CHECK-LABEL: define void @lane_mask(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[VSCALEX4:%.*]] = shl i64 [[VSCALE]], 2
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IV]], 2
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr align 4 [[SCEVGEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP1]] = add i64 [[IV]], [[VSCALEX4]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP1]], i64 [[N]])
+; CHECK-NEXT: [[COND:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vscalex4 = shl i64 %vscale, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr %gep, i32 4, <vscale x 4 x i1> %active.lane.mask)
+  %iv.next = add i64 %iv, %vscalex4
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %iv.next, i64 %n)
+  %cond = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The store between the call and the branch shouldn't prevent the
+; postincrement value from being used.
+define void @lane_mask_not_last(ptr %dst, i64 %n) #0 {
+; CHECK-LABEL: define void @lane_mask_not_last(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[VSCALEX4:%.*]] = shl i64 [[VSCALE]], 2
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP0:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IV]], 2
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP0]] = add i64 [[IV]], [[VSCALEX4]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP0]], i64 [[N]])
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr align 4 [[SCEVGEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[COND:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vscalex4 = shl i64 %vscale, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  %iv.next = add i64 %iv, %vscalex4
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %iv.next, i64 %n)
+  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr %gep, i32 4, <vscale x 4 x i1> %active.lane.mask)
+  %cond = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The call to cmp_fn in the loop should use the postincrement value of %iv.
+define void @uses_cmp_fn(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @uses_cmp_fn(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[LSR_IV]], 2
+; CHECK-NEXT: [[LSR_IV1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: store i32 0, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[LSR_IV_NEXT]])
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  store i32 0, ptr %gep, align 4
+  %iv.next = add i64 %iv, 1
+  %cond = tail call i1 @cmp_fn(i64 %iv.next)
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The store between the call and the branch shouldn't prevent the
+; postincrement value from being used.
+define void @uses_cmp_fn_not_last(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @uses_cmp_fn_not_last(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[LSR_IV:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IV]], 2
+; CHECK-NEXT: [[LSR_IV1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: [[LSR_IV]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[LSR_IV]])
+; CHECK-NEXT: store i32 0, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  %iv.next = add i64 %iv, 1
+  %cond = tail call i1 @cmp_fn(i64 %iv.next)
+  store i32 0, ptr %gep, align 4
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; cmp2 will use a preincrement induction variable as it isn't directly the loop
+; termination condition.
+; FIXME: We could potentially handle this by examining the operands of the 'and'
+; instruction.
+define void @cmp_and(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @cmp_and(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[LOOP]] ], [ [[DST]], %[[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[TMP0]], %[[ENTRY]] ]
+; CHECK-NEXT: [[VAL:%.*]] = load i64, ptr [[LSR_IV1]], align 8
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[VAL]], [[N]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[LSR_IV]], 0
+; CHECK-NEXT: [[COND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  %val = load i64, ptr %gep, align 8
+  %iv.next = add i64 %iv, 1
+  %cmp1 = icmp ne i64 %val, %n
+  %cmp2 = icmp ne i64 %iv.next, %n
+  %cond = and i1 %cmp1, %cmp2
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+
+declare i64 @llvm.vscale.i64()
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr captures(none), i32 immarg, <vscale x 4 x i1>)
+declare i1 @cmp_fn(i64)
+
+attributes #0 = { "target-features"="+sve2" }
