diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index dc8fa4379752f..0e5c96851d3a2 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1420,8 +1420,9 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg, } unsigned LoopCost = 1; - if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) || - TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) { + if (LU.Kind == LSRUse::Address && + (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) || + TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType()))) { const SCEV *Start; const SCEVConstant *Step; if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step)))) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll index 9c36bae6fac13..ec257bcf123f3 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll @@ -6,77 +6,81 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: ldr.w r12, [r0] ; CHECK-NEXT: subs.w r9, r1, #1 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: and r8, r9, #3 +; CHECK-NEXT: and r6, r9, #3 ; CHECK-NEXT: subs r7, r1, #2 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB0_4 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: b .LBB0_6 +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: cbnz r6, .LBB0_7 +; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .LBB0_4: @ %while.body.preheader.new ; CHECK-NEXT: bic r7, r9, #3 -; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: str r6, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: movs r7, #4 ; CHECK-NEXT: .LBB0_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r10, [r0, #16]! -; CHECK-NEXT: sub.w r9, r9, #4 -; CHECK-NEXT: ldrd r5, r4, [r0, #-12] -; CHECK-NEXT: ldr r11, [r0, #-4] +; CHECK-NEXT: ldr r11, [r0, #16]! +; CHECK-NEXT: ldrd r5, r7, [r0, #-12] +; CHECK-NEXT: ldr r4, [r0, #-4] ; CHECK-NEXT: cmp r12, r5 -; CHECK-NEXT: it gt -; CHECK-NEXT: subgt r6, r7, #3 ; CHECK-NEXT: csel r5, r5, r12, gt -; CHECK-NEXT: cmp r5, r4 +; CHECK-NEXT: csinc r6, r10, r8, le +; CHECK-NEXT: cmp r5, r7 ; CHECK-NEXT: it gt -; CHECK-NEXT: subgt r6, r7, #2 -; CHECK-NEXT: csel r5, r4, r5, gt -; CHECK-NEXT: cmp r5, r11 +; CHECK-NEXT: addgt.w r6, r8, #2 +; CHECK-NEXT: csel r7, r7, r5, gt +; CHECK-NEXT: cmp r7, r4 ; CHECK-NEXT: it gt -; CHECK-NEXT: subgt r6, r7, #1 -; CHECK-NEXT: csel r5, r11, r5, gt -; CHECK-NEXT: cmp r5, r10 -; CHECK-NEXT: csel r6, r7, r6, gt -; CHECK-NEXT: add.w r7, r7, #4 -; CHECK-NEXT: csel r12, r10, r5, gt +; CHECK-NEXT: addgt.w r6, r8, #3 +; CHECK-NEXT: csel r7, r4, r7, gt +; CHECK-NEXT: add.w r8, r8, #4 +; CHECK-NEXT: cmp r7, r11 +; CHECK-NEXT: csel r10, r8, r6, gt +; CHECK-NEXT: csel r12, r11, r7, gt ; CHECK-NEXT: le lr, .LBB0_5 -; CHECK-NEXT: .LBB0_6: @ %while.end.loopexit.unr-lcssa -; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: beq .LBB0_10 -; CHECK-NEXT: @ %bb.7: @ %while.body.epil +; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit +; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload +; CHECK-NEXT: sub.w r9, r9, r8 +; CHECK-NEXT: cbz r6, .LBB0_10 +; CHECK-NEXT: .LBB0_7: @ %while.body.epil ; CHECK-NEXT: ldr r7, [r0, #4] ; CHECK-NEXT: sub.w r1, r1, r9 ; CHECK-NEXT: cmp r12, r7 -; CHECK-NEXT: csel r6, r1, r6, gt +; CHECK-NEXT: csel r10, r1, r10, gt ; CHECK-NEXT: csel r12, r7, r12, gt -; CHECK-NEXT: cmp.w r8, #1 +; CHECK-NEXT: cmp r6, #1 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.8: @ %while.body.epil.1 ; CHECK-NEXT: ldr r7, [r0, #8] ; CHECK-NEXT: cmp r12, r7 -; CHECK-NEXT: csinc r6, r6, r1, le +; CHECK-NEXT: csinc r10, r10, r1, le ; CHECK-NEXT: csel r12, r7, r12, gt -; CHECK-NEXT: cmp.w r8, #2 +; CHECK-NEXT: cmp r6, #2 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.9: @ %while.body.epil.2 ; CHECK-NEXT: ldr r0, [r0, #12] ; CHECK-NEXT: cmp r12, r0 ; CHECK-NEXT: it gt -; CHECK-NEXT: addgt r6, r1, #2 +; CHECK-NEXT: addgt.w r10, r1, #2 ; CHECK-NEXT: csel r12, r0, r12, gt ; CHECK-NEXT: .LBB0_10: @ %while.end ; CHECK-NEXT: str.w r12, [r2] -; CHECK-NEXT: str r6, [r3] +; CHECK-NEXT: str.w r10, [r3] +; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %0 = load i32, ptr %pSrc, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll index 96aff0233e4d9..9c8ef2ed899cf 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll @@ -1,24 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s +; FIXME: Loop strength reduction makes suboptimal choices here due to the +; isLSRCostLess function preferring to minimise the number of addrecs even +; when it increases the total number of adds. + define void @ptr_iv_v4i32(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i32 %y) { ; CHECK-LABEL: ptr_iv_v4i32: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: adr r3, .LCPI0_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] -; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: add.w r4, r0, r12 +; CHECK-NEXT: add.w r3, r1, r12 +; CHECK-NEXT: vldrw.u32 q1, [r4, q0, uxtw #2] +; CHECK-NEXT: add.w r12, r12, #64 ; CHECK-NEXT: vadd.i32 q1, q1, r2 -; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2] -; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: vstrw.32 q1, [r3, q0, uxtw #2] ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI0_0: @@ -110,21 +116,23 @@ end: define void @ptr_iv_v8i16(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i16 %y) { ; CHECK-LABEL: ptr_iv_v8i16: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: adr r3, .LCPI2_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] -; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: add.w r4, r0, r12 +; CHECK-NEXT: add.w r3, r1, r12 +; CHECK-NEXT: vldrh.u16 q1, [r4, q0, uxtw #1] +; CHECK-NEXT: add.w r12, r12, #64 ; CHECK-NEXT: vadd.i16 q1, q1, r2 -; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1] -; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: vstrh.16 q1, [r3, q0, uxtw #1] ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI2_0: @@ -164,23 +172,25 @@ end: define void @ptr_iv_v8i16_mult(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i16 %y) { ; CHECK-LABEL: ptr_iv_v8i16_mult: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: adr.w r12, .LCPI3_0 -; CHECK-NEXT: adr r3, .LCPI3_1 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: adr r4, .LCPI3_1 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q2, [r0, q0] -; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: adds r4, r0, r3 +; CHECK-NEXT: add.w r12, r1, r3 +; CHECK-NEXT: vldrh.u16 q2, [r4, q1] +; CHECK-NEXT: adds r3, #64 ; CHECK-NEXT: vadd.i16 q2, q2, r2 -; CHECK-NEXT: vstrh.16 q2, [r1, q1] -; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: vstrh.16 q2, [r12, q0] ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI3_0: @@ -230,21 +240,23 @@ end: define void @ptr_iv_v16i8(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i8 %y) { ; CHECK-LABEL: ptr_iv_v16i8: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: adr r3, .LCPI4_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q1, [r0, q0] -; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: add.w r4, r0, r12 +; CHECK-NEXT: add.w r3, r1, r12 +; CHECK-NEXT: vldrb.u8 q1, [r4, q0] +; CHECK-NEXT: add.w r12, r12, #64 ; CHECK-NEXT: vadd.i8 q1, q1, r2 -; CHECK-NEXT: vstrb.8 q1, [r1, q0] -; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: vstrb.8 q1, [r3, q0] ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI4_0: @@ -292,23 +304,25 @@ end: define void @ptr_iv_v16i8_mult(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i8 %y) { ; CHECK-LABEL: ptr_iv_v16i8_mult: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: adr.w r12, .LCPI5_0 -; CHECK-NEXT: adr r3, .LCPI5_1 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: adr r4, .LCPI5_1 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q2, [r0, q0] -; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: adds r4, r0, r3 +; CHECK-NEXT: add.w r12, r1, r3 +; CHECK-NEXT: vldrb.u8 q2, [r4, q1] +; CHECK-NEXT: adds r3, #64 ; CHECK-NEXT: vadd.i8 q2, q2, r2 -; CHECK-NEXT: vstrb.8 q2, [r1, q1] -; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: vstrb.8 q2, [r12, q0] ; CHECK-NEXT: le lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI5_0: @@ -374,21 +388,23 @@ end: define void @ptr_iv_v4f32(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, float %y) { ; CHECK-LABEL: ptr_iv_v4f32: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: adr r3, .LCPI6_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] -; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: add.w r4, r0, r12 +; CHECK-NEXT: add.w r3, r1, r12 +; CHECK-NEXT: vldrw.u32 q1, [r4, q0, uxtw #2] +; CHECK-NEXT: add.w r12, r12, #64 ; CHECK-NEXT: vadd.f32 q1, q1, r2 -; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2] -; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: vstrw.32 q1, [r3, q0, uxtw #2] ; CHECK-NEXT: le lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI6_0: @@ -485,16 +501,18 @@ define void @ptr_iv_v8f16(ptr noalias nocapture readonly %A, ptr noalias nocaptu ; CHECK-NEXT: vmov s0, r2 ; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: adr r3, .LCPI8_0 -; CHECK-NEXT: vmov.f16 r2, s0 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: adr r2, .LCPI8_0 +; CHECK-NEXT: vmov.f16 r12, s0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] -; CHECK-NEXT: adds r0, #64 -; CHECK-NEXT: vadd.f16 q1, q1, r2 -; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1] -; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: adds r2, r0, r3 +; CHECK-NEXT: vldrh.u16 q1, [r2, q0, uxtw #1] +; CHECK-NEXT: adds r2, r1, r3 +; CHECK-NEXT: adds r3, #64 +; CHECK-NEXT: vadd.f16 q1, q1, r12 +; CHECK-NEXT: vstrh.16 q1, [r2, q0, uxtw #1] ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/cannot_pre_post_idx.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/cannot_pre_post_idx.ll new file mode 100644 index 0000000000000..96dd9a503cc64 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/cannot_pre_post_idx.ll @@ -0,0 +1,149 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt < %s -loop-reduce -lsr-preferred-addressing-mode=none -S | FileCheck %s --check-prefixes=CHECK,CHECK-NONE +; RUN: opt < %s -loop-reduce -lsr-preferred-addressing-mode=preindexed -S | FileCheck %s --check-prefixes=CHECK,CHECK-PREINDEXED +; RUN: opt < %s -loop-reduce -lsr-preferred-addressing-mode=postindexed -S | FileCheck %s --check-prefixes=CHECK,CHECK-POSTINDEXED + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-arm-none-eabi" + +; This is an example where we should always pre/postincrement, as it can be +; folded into the load. +define i32 @has_load(ptr %p, i32 %n) { +; CHECK-NONE-LABEL: define i32 @has_load( +; CHECK-NONE-SAME: ptr [[P:%.*]], i32 [[N:%.*]]) { +; CHECK-NONE-NEXT: entry: +; CHECK-NONE-NEXT: br label [[LOOP:%.*]] +; CHECK-NONE: loop: +; CHECK-NONE-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[LOOP]] ], [ [[P]], [[ENTRY:%.*]] ] +; CHECK-NONE-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ [[N]], [[ENTRY]] ] +; CHECK-NONE-NEXT: [[ACC:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NONE-NEXT: [[LOAD:%.*]] = load i32, ptr [[LSR_IV1]], align 4 +; CHECK-NONE-NEXT: [[ADD]] = add i32 [[ACC]], [[LOAD]] +; CHECK-NONE-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -1 +; CHECK-NONE-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i32 4 +; CHECK-NONE-NEXT: [[COND:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0 +; CHECK-NONE-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NONE: exit: +; CHECK-NONE-NEXT: ret i32 [[ACC]] +; +; CHECK-PREINDEXED-LABEL: define i32 @has_load( +; CHECK-PREINDEXED-SAME: ptr [[P:%.*]], i32 [[N:%.*]]) { +; CHECK-PREINDEXED-NEXT: entry: +; CHECK-PREINDEXED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[P]], i32 -4 +; CHECK-PREINDEXED-NEXT: br label [[LOOP:%.*]] +; CHECK-PREINDEXED: loop: +; CHECK-PREINDEXED-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[LOOP]] ], [ [[SCEVGEP]], [[ENTRY:%.*]] ] +; CHECK-PREINDEXED-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ [[N]], [[ENTRY]] ] +; CHECK-PREINDEXED-NEXT: [[ACC:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-PREINDEXED-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LSR_IV1]], i32 4 +; CHECK-PREINDEXED-NEXT: [[LOAD:%.*]] = load i32, ptr [[SCEVGEP3]], align 4 +; CHECK-PREINDEXED-NEXT: [[ADD]] = add i32 [[ACC]], [[LOAD]] +; CHECK-PREINDEXED-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -1 +; CHECK-PREINDEXED-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 4 +; CHECK-PREINDEXED-NEXT: [[COND:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0 +; CHECK-PREINDEXED-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-PREINDEXED: exit: +; CHECK-PREINDEXED-NEXT: ret i32 [[ACC]] +; +; CHECK-POSTINDEXED-LABEL: define i32 @has_load( +; CHECK-POSTINDEXED-SAME: ptr [[P:%.*]], i32 [[N:%.*]]) { +; CHECK-POSTINDEXED-NEXT: entry: +; CHECK-POSTINDEXED-NEXT: br label [[LOOP:%.*]] +; CHECK-POSTINDEXED: loop: +; CHECK-POSTINDEXED-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[LOOP]] ], [ [[P]], [[ENTRY:%.*]] ] +; CHECK-POSTINDEXED-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ [[N]], [[ENTRY]] ] +; CHECK-POSTINDEXED-NEXT: [[ACC:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-POSTINDEXED-NEXT: [[LOAD:%.*]] = load i32, ptr [[LSR_IV1]], align 4 +; CHECK-POSTINDEXED-NEXT: [[ADD]] = add i32 [[ACC]], [[LOAD]] +; CHECK-POSTINDEXED-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -1 +; CHECK-POSTINDEXED-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i32 4 +; CHECK-POSTINDEXED-NEXT: [[COND:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0 +; CHECK-POSTINDEXED-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-POSTINDEXED: exit: +; CHECK-POSTINDEXED-NEXT: ret i32 [[ACC]] +; +entry: + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %acc = phi i32 [ 0, %entry ], [ %add, %loop ] + %gep = getelementptr i32, ptr %p, i32 %idx + %load = load i32, ptr %gep, align 4 + %add = add i32 %acc, %load + %idx.next = add nuw i32 %idx, 1 + %cond = icmp eq i32 %idx.next, %n + br i1 %cond, label %exit, label %loop + +exit: + ret i32 %acc +} + +; Here there's no load, so there's nothing to fold a pre/postincrement into. +define i32 @no_mem_access(i32 %n) { +; CHECK-LABEL: define i32 @no_mem_access( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IDX_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ADD]] = add i32 [[ACC]], [[IDX]] +; CHECK-NEXT: [[IDX_NEXT]] = add nuw i32 [[IDX]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[N]], [[IDX_NEXT]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret i32 [[ACC]] +; +entry: + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %acc = phi i32 [ 0, %entry ], [ %add, %loop ] + %add = add i32 %acc, %idx + %idx.next = add nuw i32 %idx, 1 + %cond = icmp eq i32 %idx.next, %n + br i1 %cond, label %exit, label %loop + +exit: + ret i32 %acc +} + +; Here there's a load, but the address is generated in a way that means +; pre/postincrement isn't possible. +define i32 @has_load_bad_addr(ptr %p, i32 %n) { +; CHECK-LABEL: define i32 @has_load_bad_addr( +; CHECK-SAME: ptr [[P:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IDX_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[IDX]], 64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P]], i32 [[AND]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[ADD]] = add i32 [[ACC]], [[LOAD]] +; CHECK-NEXT: [[IDX_NEXT]] = add nuw i32 [[IDX]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[N]], [[IDX_NEXT]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret i32 [[ACC]] +; +entry: + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %acc = phi i32 [ 0, %entry ], [ %add, %loop ] + %and = and i32 %idx, 64 + %gep = getelementptr i32, ptr %p, i32 %and + %load = load i32, ptr %gep, align 4 + %add = add i32 %acc, %load + %idx.next = add nuw i32 %idx, 1 + %cond = icmp eq i32 %idx.next, %n + br i1 %cond, label %exit, label %loop + +exit: + ret i32 %acc +}