diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 1ef01e3b793d5..bf22d63850835 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" declare void @init(ptr nocapture nofree) @@ -200,104 +200,6 @@ loop_exit: } -; Test where offset relative to alloca is negative and we shouldn't -; treat predicated loads as being always dereferenceable. -define i8 @test_negative_off(i16 %len, ptr %test_base) { -; CHECK-LABEL: @test_negative_off( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64638 x i8], align 1 -; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE2]] ] -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16 -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 -1000, [[DOTCAST]] -; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i16 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = load i1, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load i1, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i1> poison, i1 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i1> [[TMP6]], i1 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP0]] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] -; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP1]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i8> [[TMP12]], i8 [[TMP15]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] -; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i8> [[TMP17]], <2 x i8> zeroinitializer -; CHECK-NEXT: [[TMP18]] = add <2 x i8> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 -; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]]) -; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ -988, [[MIDDLE_BLOCK]] ], [ -1000, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[ADDR]], align 1 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i8 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i16 [[IV]], -990 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i8 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i8 [[ACCUM_NEXT_LCSSA]] -; -entry: - %alloca = alloca [64638 x i8] - call void @init(ptr %alloca) - br label %loop -loop: - %iv = phi i16 [ -1000, %entry ], [ %iv.next, %latch ] - %accum = phi i8 [ 0, %entry ], [ %accum.next, %latch ] - %iv.next = add i16 %iv, 1 - %test_addr = getelementptr inbounds i1, ptr %test_base, i16 %iv - %earlycnd = load i1, ptr %test_addr - br i1 %earlycnd, label %pred, label %latch -pred: - %addr = getelementptr i8, ptr %alloca, i16 %iv - %val = load i8, ptr %addr - br label %latch -latch: - %val.phi = phi i8 [ 0, %loop ], [ %val, %pred ] - %accum.next = add i8 %accum, %val.phi - %exit = icmp ugt i16 %iv, -990 - br i1 %exit, label %loop_exit, label %loop -loop_exit: - ret i8 %accum.next -} - - define i32 @loop_requires_scev_predicate(ptr %dest, i32 %end) { ; CHECK-LABEL: @loop_requires_scev_predicate( ; CHECK-NEXT: entry: @@ -423,3 +325,430 @@ for.inc: exit: ret i32 0 } + + +; Test reverse loops where we should be able to prove loads in predicated blocks +; are safe to load unconditionally. +define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) { +; CHECK-LABEL: @test_rev_loops_deref_loads( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init(ptr [[LOCAL_SRC]]) +; CHECK-NEXT: call void @init(ptr [[LOCAL_CMP]]) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP12:%.*]] = shl nsw i32 [[TMP11]], 2 +; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] +; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP21]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 2 +; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[IV]] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP19]], 3 +; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[IV]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP20]], 2 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[IV]] +; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false) +; CHECK-NEXT: ret void +; +entry: + %local_dest = alloca [1024 x i32], align 4 + %local_src = alloca [1024 x i32], align 4 + %local_cmp = alloca [1024 x i32], align 4 + call void @init(ptr %local_src) + call void @init(ptr %local_cmp) + br label %for.body + +for.body: + %iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.inc ] + %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp3.not = icmp eq i32 %0, 3 + br i1 %cmp3.not, label %for.inc, label %if.then + +if.then: + %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %iv + %1 = load i32, ptr %arrayidx5, align 4 + %mul = shl nsw i32 %1, 2 + %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %iv + store i32 %mul, ptr %arrayidx7, align 4 + br label %for.inc + +for.inc: + %iv.next = add nsw i64 %iv, -1 + %cmp2.not = icmp eq i64 %iv, 0 + br i1 %cmp2.not, label %exit, label %for.body + +exit: + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %local_dest, i64 1024, i1 false) + ret void +} + + +; Test reverse loops where we *cannot* prove loads in predicated blocks are safe +; to load unconditionally. +define void @test_rev_loops_non_deref_loads(ptr nocapture noundef writeonly %dest) { +; CHECK-LABEL: @test_rev_loops_non_deref_loads( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init(ptr [[LOCAL_SRC]]) +; CHECK-NEXT: call void @init(ptr [[LOCAL_CMP]]) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 -1) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 -1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = shl nsw i32 [[TMP10]], 2 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP12]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = shl nsw i32 [[TMP17]], 2 +; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP19]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 -2) +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[OFF:%.*]] = add i64 [[IV]], -1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[OFF]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP22]], 3 +; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[OFF]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP23]], 2 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[OFF]] +; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false) +; CHECK-NEXT: ret void +; +entry: + %local_dest = alloca [1024 x i32], align 4 + %local_src = alloca [1024 x i32], align 4 + %local_cmp = alloca [1024 x i32], align 4 + call void @init(ptr %local_src) + call void @init(ptr %local_cmp) + br label %for.body + +for.body: + %iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.inc ] + %off = add i64 %iv, -1 + %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %off + %0 = load i32, ptr %arrayidx, align 4 + %cmp3.not = icmp eq i32 %0, 3 + br i1 %cmp3.not, label %for.inc, label %if.then + +if.then: + %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %off + %1 = load i32, ptr %arrayidx5, align 4 + %mul = shl nsw i32 %1, 2 + %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %off + store i32 %mul, ptr %arrayidx7, align 4 + br label %for.inc + +for.inc: + %iv.next = add nsw i64 %iv, -1 + %cmp2.not = icmp eq i64 %iv, 0 + br i1 %cmp2.not, label %exit, label %for.body + +exit: + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %local_dest, i64 1024, i1 false) + ret void +} + + +; Test a loop with a positive step recurrence that has a strided access +define i16 @test_strided_access(i64 %len, ptr %test_base) { +; CHECK-LABEL: @test_strided_access( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [163840 x i16], align 4 +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[TMP6]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP10]], i32 1 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP12]], <2 x i16> zeroinitializer +; CHECK-NEXT: [[TMP13]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP15:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP13]]) +; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[L_T:%.*]] = load i8, ptr [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sge i8 [[L_T]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[PRED:%.*]], label [[LATCH]] +; CHECK: pred: +; CHECK-NEXT: [[IV_STRIDE:%.*]] = mul i64 [[IV]], 2 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[IV_STRIDE]] +; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[ADDR]], align 2 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i16 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] +; CHECK-NEXT: [[ACCUM_NEXT]] = add i16 [[ACCUM]], [[VAL_PHI]] +; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095 +; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: loop_exit: +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i16 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i16 [[ACCUM_NEXT_LCSSA]] +; +entry: + %alloca = alloca [163840 x i16], align 4 + call void @init(ptr %alloca) + br label %loop +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] + %accum = phi i16 [ 0, %entry ], [ %accum.next, %latch ] + %iv.next = add i64 %iv, 1 + %test_addr = getelementptr inbounds i8, ptr %test_base, i64 %iv + %l.t = load i8, ptr %test_addr + %cmp = icmp sge i8 %l.t, 0 + br i1 %cmp, label %pred, label %latch +pred: + %iv.stride = mul i64 %iv, 2 + %addr = getelementptr inbounds i16, ptr %alloca, i64 %iv.stride + %val = load i16, ptr %addr, align 2 + br label %latch +latch: + %val.phi = phi i16 [0, %loop], [%val, %pred] + %accum.next = add i16 %accum, %val.phi + %exit = icmp eq i64 %iv, 4095 + br i1 %exit, label %loop_exit, label %loop + +loop_exit: + ret i16 %accum.next +} + + +; Test a loop with a negative step recurrence that has a strided access +define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly %dest) { +; CHECK-LABEL: @test_rev_loops_strided_deref_loads( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init(ptr [[LOCAL_SRC]]) +; CHECK-NEXT: call void @init(ptr [[LOCAL_CMP]]) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 511, [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = shl nsw i32 [[TMP11]], 2 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP23]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP24]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i32 [[TMP12]], 2 +; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 -2) +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 511, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[IV]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP21]], 3 +; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[IV_STRIDED:%.*]] = mul i64 [[IV]], 2 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[IV_STRIDED]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP22]], 2 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[IV]] +; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false) +; CHECK-NEXT: ret void +; +entry: + %local_dest = alloca [1024 x i32], align 4 + %local_src = alloca [1024 x i32], align 4 + %local_cmp = alloca [1024 x i32], align 4 + call void @init(ptr %local_src) + call void @init(ptr %local_cmp) + br label %for.body + +for.body: + %iv = phi i64 [ 511, %entry ], [ %iv.next, %for.inc ] + %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp3.not = icmp eq i32 %0, 3 + br i1 %cmp3.not, label %for.inc, label %if.then + +if.then: + %iv.strided = mul i64 %iv, 2 + %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %iv.strided + %1 = load i32, ptr %arrayidx5, align 4 + %mul = shl nsw i32 %1, 2 + %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %iv + store i32 %mul, ptr %arrayidx7, align 4 + br label %for.inc + +for.inc: + %iv.next = add nsw i64 %iv, -1 + %cmp2.not = icmp eq i64 %iv, 0 + br i1 %cmp2.not, label %exit, label %for.body + +exit: + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %local_dest, i64 1024, i1 false) + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll new file mode 100644 index 0000000000000..1dd526df503bd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16" + +declare void @init(ptr nocapture nofree) + + +; Test where offset relative to alloca is negative and we shouldn't +; treat predicated loads as being always dereferenceable. +define i8 @test_negative_off(i16 %len, ptr %test_base) { +; CHECK-LABEL: @test_negative_off( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64638 x i8], align 1 +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 -1000, [[DOTCAST]] +; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i16 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = load i1, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i1, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i1> poison, i1 [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i1> [[TMP6]], i1 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP1]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i8> [[TMP12]], i8 [[TMP15]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i8> [[TMP17]], <2 x i8> zeroinitializer +; CHECK-NEXT: [[TMP18]] = add <2 x i8> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]]) +; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ -988, [[MIDDLE_BLOCK]] ], [ -1000, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 +; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] +; CHECK: pred: +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[ADDR]], align 1 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] +; CHECK-NEXT: [[ACCUM_NEXT]] = add i8 [[ACCUM]], [[VAL_PHI]] +; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i16 [[IV]], -990 +; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: loop_exit: +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i8 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i8 [[ACCUM_NEXT_LCSSA]] +; +entry: + %alloca = alloca [64638 x i8] + call void @init(ptr %alloca) + br label %loop +loop: + %iv = phi i16 [ -1000, %entry ], [ %iv.next, %latch ] + %accum = phi i8 [ 0, %entry ], [ %accum.next, %latch ] + %iv.next = add i16 %iv, 1 + %test_addr = getelementptr inbounds i1, ptr %test_base, i16 %iv + %earlycnd = load i1, ptr %test_addr + br i1 %earlycnd, label %pred, label %latch +pred: + %addr = getelementptr i8, ptr %alloca, i16 %iv + %val = load i8, ptr %addr + br label %latch +latch: + %val.phi = phi i8 [ 0, %loop ], [ %val, %pred ] + %accum.next = add i8 %accum, %val.phi + %exit = icmp ugt i16 %iv, -990 + br i1 %exit, label %loop_exit, label %loop +loop_exit: + ret i8 %accum.next +}