diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 05d504cbcb6bb..6a1f4b3e3bedf 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -114,6 +114,9 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
   bool enableScalableVectorization() const override {
     return ST->hasVInstructions();
   }
+  bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override {
+    return ST->hasVInstructions();
+  }
   TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override {
     return ST->hasVInstructions() ? TailFoldingStyle::DataWithEVL
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
index 7b56ba872d727..be6e9187d90d2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
@@ -26,37 +26,39 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; ZVFBFMIN-LABEL: define void @fadd(
 ; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
-; ZVFBFMIN-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFBFMIN-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8
-; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP8]]
-; ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ZVFBFMIN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; ZVFBFMIN: [[VECTOR_PH]]:
 ; ZVFBFMIN-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; ZVFBFMIN-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
-; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
-; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFBFMIN-NEXT: [[TMP3:%.*]] = sub i64 [[TMP10]], 1
+; ZVFBFMIN-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP3]]
+; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
+; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; ZVFBFMIN-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
 ; ZVFBFMIN-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP12]], 8
 ; ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]]
 ; ZVFBFMIN: [[VECTOR_BODY]]:
-; ZVFBFMIN-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
 ; ZVFBFMIN-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]]
 ; ZVFBFMIN-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]]
-; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP1]], align 2
-; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP2]], align 2
+; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 8 x bfloat> @llvm.vp.load.nxv8bf16.p0(ptr align 2 [[TMP1]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
+; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = call <vscale x 8 x bfloat> @llvm.vp.load.nxv8bf16.p0(ptr align 2 [[TMP2]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
 ; ZVFBFMIN-NEXT: [[TMP11:%.*]] = fadd <vscale x 8 x bfloat> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; ZVFBFMIN-NEXT: store <vscale x 8 x bfloat> [[TMP11]], ptr [[TMP1]], align 2
-; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], [[TMP5]]
-; ZVFBFMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; ZVFBFMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; ZVFBFMIN-NEXT: call void @llvm.vp.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[TMP11]], ptr align 2 [[TMP1]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
+; ZVFBFMIN-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
+; ZVFBFMIN-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[TMP0]]
+; ZVFBFMIN-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
+; ZVFBFMIN-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; ZVFBFMIN-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; ZVFBFMIN: [[MIDDLE_BLOCK]]:
-; ZVFBFMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; ZVFBFMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; ZVFBFMIN-NEXT: br label %[[EXIT:.*]]
 ; ZVFBFMIN: [[SCALAR_PH]]:
-; ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; ZVFBFMIN-NEXT: br label %[[LOOP:.*]]
 ; ZVFBFMIN: [[LOOP]]:
-; ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
 ; ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
 ; ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
 ; ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
@@ -65,7 +67,7 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; ZVFBFMIN-NEXT: store bfloat [[Z]], ptr [[A_GEP]], align 2
 ; ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
 ; ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
-; ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; ZVFBFMIN: [[EXIT]]:
 ; ZVFBFMIN-NEXT: ret void
 ;
@@ -138,41 +140,43 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
 ; ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv(
 ; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
-; ZVFBFMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFBFMIN-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ZVFBFMIN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; ZVFBFMIN: [[VECTOR_PH]]:
 ; ZVFBFMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; ZVFBFMIN-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFBFMIN-NEXT: [[TMP10:%.*]] = sub i64 [[TMP3]], 1
+; ZVFBFMIN-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP10]]
+; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; ZVFBFMIN-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; ZVFBFMIN-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]]
 ; ZVFBFMIN: [[VECTOR_BODY]]:
-; ZVFBFMIN-NEXT: [[TMP6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT: [[TMP6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; ZVFBFMIN-NEXT: [[TMP7:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP6]]
 ; ZVFBFMIN-NEXT: [[TMP8:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP6]]
 ; ZVFBFMIN-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP6]]
-; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP7]], align 2
-; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP8]], align 2
-; ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
+; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x bfloat> @llvm.vp.load.nxv4bf16.p0(ptr align 2 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = call <vscale x 4 x bfloat> @llvm.vp.load.nxv4bf16.p0(ptr align 2 [[TMP8]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
 ; ZVFBFMIN-NEXT: [[TMP13:%.*]] = fpext <vscale x 4 x bfloat> [[WIDE_LOAD]] to <vscale x 4 x float>
 ; ZVFBFMIN-NEXT: [[TMP14:%.*]] = fpext <vscale x 4 x bfloat> [[WIDE_LOAD1]] to <vscale x 4 x float>
 ; ZVFBFMIN-NEXT: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[TMP13]], <vscale x 4 x float> [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD2]])
-; ZVFBFMIN-NEXT: store <vscale x 4 x float> [[TMP15]], ptr [[TMP9]], align 4
-; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP6]], [[TMP5]]
-; ZVFBFMIN-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; ZVFBFMIN-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; ZVFBFMIN-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; ZVFBFMIN-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
+; ZVFBFMIN-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[TMP6]]
+; ZVFBFMIN-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
+; ZVFBFMIN-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; ZVFBFMIN-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; ZVFBFMIN: [[MIDDLE_BLOCK]]:
-; ZVFBFMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; ZVFBFMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; ZVFBFMIN-NEXT: br label %[[EXIT:.*]]
 ; ZVFBFMIN: [[SCALAR_PH]]:
-; ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; ZVFBFMIN-NEXT: br label %[[LOOP:.*]]
 ; ZVFBFMIN: [[LOOP]]:
-; ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
 ; ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
 ; ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
 ; ZVFBFMIN-NEXT: [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]]
@@ -185,7 +189,7 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
 ; ZVFBFMIN-NEXT: store float [[FMULADD]], ptr [[C_GEP]], align 4
 ; ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
 ; ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
-; ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
 ; ZVFBFMIN: [[EXIT]]:
 ; ZVFBFMIN-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
index 75ae6df5fcd37..9f7ac7ac1771f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
@@ -62,50 +62,10 @@ define i32 @any_of_reduction_used_in_blend_with_multiple_phis(ptr %src, i64 %N,
 ; CHECK-LABEL: define i32 @any_of_reduction_used_in_blend_with_multiple_phis(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i1 [[C_0:%.*]], i1 [[C_1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT: [[TMP7:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
-; CHECK-NEXT: [[TMP8:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[SRC]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x ptr> @llvm.masked.gather.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT4]], i32 8, <vscale x 2 x i1> [[TMP8]], <vscale x 2 x ptr> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_MASKED_GATHER]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = or <vscale x 2 x i1> [[VEC_PHI]], [[TMP9]]
-; CHECK-NEXT: [[PREDPHI]] = select <vscale x 2 x i1> [[TMP8]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[PREDPHI]])
-; CHECK-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 0, i32 0
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[ANY_OF_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[ANY_OF_RED:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT: br i1 [[C_0]], label %[[X_1:.*]], label %[[ELSE_1:.*]]
 ; CHECK: [[ELSE_1]]:
 ; CHECK-NEXT: br i1 [[C_1]], label %[[X_1]], label %[[ELSE_2:.*]]
@@ -121,9 +81,9 @@ define i32 @any_of_reduction_used_in_blend_with_multiple_phis(ptr %src, i64 %N,
 ; CHECK-NEXT: [[ANY_OF_RED_NEXT]] = phi i32 [ [[P]], %[[X_1]] ], [ [[SEL]], %[[ELSE_2]] ]
 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[ANY_OF_RED_NEXT]], %[[LOOP_LATCH]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[ANY_OF_RED_NEXT]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT: ret i32 [[RES]]
 ;
 entry:
@@ -159,9 +119,3 @@ exit:
 }
 
 attributes #0 = { "target-cpu"="sifive-p670" }
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index aad9128a240de..3a77d2e50ee3d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -11,45 +11,46 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -3
 ; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], 3
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], [[TMP4]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP7]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
-; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT: [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 3, [[TMP10]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 3, [[TMP17]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP20]], i32 2, <vscale x 8 x i1> splat (i1 true))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP12]])
+; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT: [[XOR]] = xor i16 0, 0
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
@@ -62,7 +63,7 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT: store i16 [[XOR]], ptr [[GEP]], align 2
 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3
 ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[TMP25]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP25]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -96,45 +97,46 @@ define void @block_with_dead_inst_2(ptr %src) #0 {
 ; CHECK-LABEL: define void @block_with_dead_inst_2(
 ; CHECK-SAME: ptr [[SRC:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 333, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 333, [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 333, [[TMP3]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
-; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 3, [[TMP7]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP6]], 8
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 8 x i64> [[TMP5]], splat (i64 3)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP8]]
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x ptr> [[TMP17]], i32 2, <vscale x 4 x i1> splat (i1 true))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 333, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 3, [[TMP7]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP13]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP10]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 333
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT: [[XOR]] = xor i16 0, 0
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
@@ -147,7 +149,7 @@ define void @block_with_dead_inst_2(ptr %src) #0 {
 ; CHECK-NEXT: store i16 [[XOR]], ptr [[GEP]], align 2
 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -181,45 +183,46 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 {
 ; CHECK-LABEL: define void @multiple_blocks_with_dead_insts_3(
 ; CHECK-SAME: ptr [[SRC:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 333, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 333, [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 333, [[TMP3]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
-; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 3, [[TMP7]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP6]], 8
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 8 x i64> [[TMP5]], splat (i64 3)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP8]]
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x ptr> [[TMP17]], i32 2, <vscale x 4 x i1> splat (i1 true))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 333, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 3, [[TMP7]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP13]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP10]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 333
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT: [[XOR]] = xor i16 0, 0
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
@@ -235,7 +238,7 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 {
 ; CHECK-NEXT: store i16 [[XOR]], ptr [[GEP]], align 2
 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -276,45 +279,46 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -3
 ; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], 3
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], [[TMP4]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP7]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
-; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT: [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 3, [[TMP10]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 3, [[TMP17]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP20]], i32 2, <vscale x 8 x i1> splat (i1 true))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP12]])
+; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT: [[XOR]] = xor i16 0, 0
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
@@ -332,7 +336,7 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT: store i16 [[XOR]], ptr [[GEP]], align 2
 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -373,45 +377,46 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 {
 ; CHECK-LABEL: define void @multiple_blocks_with_dead_inst_multiple_successors_5(
 ; CHECK-SAME: ptr [[SRC:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 333, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 333, [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 333, [[TMP3]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
-; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 3, [[TMP7]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP6]], 8
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 8 x i64> [[TMP5]], splat (i64 3)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP8]]
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x ptr> [[TMP17]], i32 2, <vscale x 4 x i1> splat (i1 true))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 333, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 3, [[TMP7]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP13]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP10]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 333
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT: [[XOR]] = xor i16 0, 0
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
@@ -431,7 +436,7 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 {
 ; CHECK-NEXT: store i16 [[XOR]], ptr [[GEP]], align 2
 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -478,45 +483,62 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -3
 ; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], 3
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], [[TMP4]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP7]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
-; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[IC]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <vscale x 8 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT: [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 3, [[TMP10]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP27]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP27]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 3, [[TMP12]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 8 x i32> [[TMP14]], [[BROADCAST_SPLAT4]]
 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP20]], i32 2, <vscale x 8 x i1> splat (i1 true))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i16> @llvm.vp.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP27]])
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_MASKED_GATHER]], zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = select <vscale x 8 x i1> [[TMP15]], <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i1> [[TMP8]], <vscale x 8 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[TMP17]], splat (i1 true)
+; CHECK-NEXT: [[TMP21:%.*]] = select <vscale x 8 x i1> [[TMP15]], <vscale x 8 x i1> [[TMP28]], <vscale x 8 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = or <vscale x 8 x i1> [[TMP19]], [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i1> [[BROADCAST_SPLAT]], <vscale x 8 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = or <vscale x 8 x i1> [[TMP22]], [[TMP23]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> [[TMP24]], i32 [[TMP27]])
+; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP27]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[XOR1315:%.*]] = phi i16 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT: [[XOR]] = xor i16 0, 0
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
@@ -536,7 +558,7 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT: store i16 [[XOR]], ptr [[GEP]], align 2
 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -580,38 +602,45 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 {
 ; CHECK-LABEL: define void @empty_block_with_phi_1(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP13]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ult <vscale x 8 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP10]], align 2
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP12]], <vscale x 8 x i16> splat (i16 99), <vscale x 8 x i16> [[WIDE_LOAD]]
-; CHECK-NEXT: store <vscale x 8 x i16> [[PREDPHI]], ptr [[TMP10]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], [[TMP5]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP10]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP13]])
+; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <vscale x 8 x i16> [[VP_OP_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i1> [[TMP14]], <vscale x 8 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP15]], <vscale x 8 x i16> [[VP_OP_LOAD]], <vscale x 8 x i16> splat (i16 99)
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i16.p0(<vscale x 8 x i16> [[PREDPHI]], ptr align 2 [[TMP10]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP13]])
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP13]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[TMP9]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[XOR1315:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[XOR1315:%.*]] = phi i32 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT: [[XOR]] = xor i32 0, 0
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
@@ -624,7 +653,7 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT: store i16 [[P]], ptr [[GEP]], align 2
 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[TMP17]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP17]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -658,38 +687,45 @@ define void @empty_block_with_phi_2(ptr %src, i64 %N) #0 {
 ; CHECK-LABEL: define void @empty_block_with_phi_2(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP13]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ult <vscale x 8 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP10]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP10]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP13]])
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP12]], <vscale x 8 x i16> [[WIDE_LOAD]], <vscale x 8 x i16> splat (i16 99)
-; CHECK-NEXT: store <vscale x 8 x i16> [[PREDPHI]], ptr [[TMP10]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], [[TMP5]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i1> [[TMP12]], <vscale x 8 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP14]], <vscale x 8 x i16> [[WIDE_LOAD]], <vscale x 8 x i16> splat (i16 99)
+; CHECK-NEXT: call void @llvm.vp.store.nxv8i16.p0(<vscale x 8 x i16> [[PREDPHI]], ptr align 2 [[TMP10]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP13]])
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP13]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[TMP9]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[XOR1315:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[XOR1315:%.*]] = phi i32 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT: [[XOR]] = xor i32 0, 0
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
@@ -702,7 +738,7 @@ define void @empty_block_with_phi_2(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT: store i16 [[P]], ptr [[GEP]], align 2
 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[TMP18]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP18]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -743,11 +779,7 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 {
 ; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], 3
 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN7]], [[TMP1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.umax.i64(i64 40, i64 [[TMP5]])
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP6]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK: [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N_EXT]], i64 1)
 ; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N_EXT]], [[UMIN]]
@@ -771,33 +803,38 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 {
 ; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP15]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP14]], 4
+; CHECK-NEXT: [[TMP20:%.*]] = sub i64 [[TMP19]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP3]], [[TMP20]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP19]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 2
-; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
-; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 2 x i64> [[TMP18]], splat (i64 3)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = mul i64 3, [[TMP17]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP23]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP16]], 4
+; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP25:%.*]] = mul <vscale x 4 x i64> [[TMP24]], splat (i64 3)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP25]]
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 2 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> [[TMP24]], i32 4, <vscale x 2 x i1> splat (i1 true)), !alias.scope [[META18:![0-9]+]], !noalias [[META21:![0-9]+]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP3]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP18]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = mul i64 3, [[TMP17]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP23]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]]), !alias.scope [[META19:![0-9]+]], !noalias [[META22:![0-9]+]]
+; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP18]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP3]]
+; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -813,7 +850,7 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 {
 ; CHECK-NEXT: store i32 0, ptr [[GEP_DST]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 3
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N_EXT]]
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -845,30 +882,31 @@ exit:
 
 attributes #0 = { "target-features"="+64bit,+v" }
 ;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
-; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
-; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
-; CHECK: [[META18]] = !{[[META19:![0-9]+]]}
-; CHECK: [[META19]] = distinct !{[[META19]], [[META20:![0-9]+]]}
-; CHECK: [[META20]] = distinct !{[[META20]], !"LVerDomain"}
-; CHECK: [[META21]] = !{[[META22:![0-9]+]], [[META23:![0-9]+]]}
-; CHECK: [[META22]] = distinct !{[[META22]], [[META20]]}
-; CHECK: [[META23]] = distinct !{[[META23]], [[META20]]}
-; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
-; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]]}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META1]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META3]], [[META1]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META3]], [[META1]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META1]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META3]], [[META1]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META3]], [[META1]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META3]], [[META1]]}
+; CHECK: [[META19]] = !{[[META20:![0-9]+]]}
+; CHECK: [[META20]] = distinct !{[[META20]], [[META21:![0-9]+]]}
+; CHECK: [[META21]] = distinct !{[[META21]], !"LVerDomain"}
+; CHECK: [[META22]] = !{[[META23:![0-9]+]], [[META24:![0-9]+]]}
+; CHECK: [[META23]] = distinct !{[[META23]], [[META21]]}
+; CHECK: [[META24]] = distinct !{[[META24]], [[META21]]}
+; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index ab8875bc2a825..18757a7a02e17 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -18,52 +18,52 @@ define void @dead_load(ptr %p, i16 %start) {
 ; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 3
 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[UMIN]], [[TMP3]]
 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP5]], [[TMP7]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP11]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP5]], [[TMP10]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP9]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 8
-; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[N_VEC]], 3
-; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP18]]
 ; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[START_EXT]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP17:%.*]] = mul <vscale x 8 x i64> [[TMP15]], splat (i64 3)
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> [[DOTSPLAT]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = mul i64 3, [[TMP14]]
-; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP20]], i64 0
-; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT1]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP5]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP16]] to i64
+; CHECK-NEXT: [[TMP20:%.*]] = mul i64 3, [[TMP19]]
+; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP20]], i64 0
+; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT1]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[P]], <vscale x 8 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP21]], i32 2, <vscale x 8 x i1> splat (i1 true))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP21]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP16]])
+; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP16]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT2]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP5]]
+; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START_EXT]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[START_EXT]], %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[START_EXT]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[P]], i64 [[IV]]
 ; CHECK-NEXT: store i16 0, ptr [[GEP]], align 2
 ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 3
 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[IV]], 111
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -94,7 +94,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-NEXT: [[ENTRY:.*]]:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 8, i32 [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 6, i32 [[TMP1]])
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK: [[VECTOR_MEMCHECK]]:
@@ -126,11 +126,11 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[VEC_IND]] to <vscale x 4 x i64>
 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 4 x i64> [[TMP15]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> [[TMP16]], i32 1, <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> [[TMP16]], i32 1, <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP8]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[SCALAR_PH]]
 ; CHECK: [[SCALAR_PH]]:
@@ -145,7 +145,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1
 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV]], 1001
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ]
 ; CHECK-NEXT: ret i8 [[R]]
@@ -181,7 +181,7 @@ define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0
 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 770, [[UMAX3]]
 ; CHECK-NEXT: [[SMAX4:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0)
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[SMAX4]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP2]], 24
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP2]], 19
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK: [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 1
@@ -210,48 +210,48 @@ define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[B]], i32 [[INDEX]]
 ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK: [[PRED_STORE_IF]]:
-; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]]
+; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META12:![0-9]+]], !noalias [[META15:![0-9]+]]
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
 ; CHECK: [[PRED_STORE_CONTINUE]]:
 ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; CHECK: [[PRED_STORE_IF5]]:
-; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]]
+; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META12]], !noalias [[META15]]
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
 ; CHECK: [[PRED_STORE_CONTINUE6]]:
 ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
 ; CHECK: [[PRED_STORE_IF7]]:
-; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]]
+; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META12]], !noalias [[META15]]
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
 ; CHECK: [[PRED_STORE_CONTINUE8]]:
 ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
 ; CHECK: [[PRED_STORE_IF9]]:
-; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]]
+; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META12]], !noalias [[META15]]
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
 ; CHECK: [[PRED_STORE_CONTINUE10]]:
 ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
 ; CHECK: [[PRED_STORE_IF11]]:
-; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]]
+; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META12]], !noalias [[META15]]
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
 ; CHECK: [[PRED_STORE_CONTINUE12]]:
 ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
 ; CHECK: [[PRED_STORE_IF13]]:
-; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]]
+; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META12]], !noalias [[META15]]
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
 ; CHECK: [[PRED_STORE_CONTINUE14]]:
 ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
 ; CHECK: [[PRED_STORE_IF15]]:
-; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]]
+; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META12]], !noalias [[META15]]
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE16]]
 ; CHECK: [[PRED_STORE_CONTINUE16]]:
 ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18]]
 ; CHECK: [[PRED_STORE_IF17]]:
-; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]]
+; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META12]], !noalias [[META15]]
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]]
 ; CHECK: [[PRED_STORE_CONTINUE18]]:
-; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP11]], i32 4, <8 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META14]]
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP11]], i32 4, <8 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META15]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[SCALAR_PH]]
 ; CHECK: [[SCALAR_PH]]:
@@ -273,7 +273,7 @@ define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0
 ; CHECK-NEXT: [[EC:%.*]] = icmp slt i32 [[IV]], [[SUB]]
 ; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_LATCH]], label %[[EXIT:.*]]
 ; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br label %[[LOOP_HEADER]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: br label %[[RETURN:.*]]
 ; CHECK: [[RETURN]]:
@@ -315,45 +315,55 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-LABEL: define void @test_phi_in_latch_redundant(
 ; CHECK-SAME: ptr [[DST:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 37, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 37, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 37, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 37, [[TMP3]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[A]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 9
-; CHECK-NEXT: [[TMP10:%.*]] = xor <vscale x 2 x i32> [[BROADCAST_SPLAT]], splat (i32 -1)
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP6]], splat (i64 9)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 9, [[TMP5]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP4]], 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 -1)
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 9)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 2 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> [[TMP10]], <vscale x 2 x ptr> [[TMP11]], i32 4, <vscale x 2 x i1> splat (i1 true))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 37, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP8]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 9, [[TMP5]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <vscale x 4 x i32> [[TMP11]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[TMP19]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> [[TMP15]], i32 [[TMP8]])
+; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP17]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 37
+; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 37, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
 ; CHECK-NEXT: br i1 false, label %[[LOOP_LATCH]], label %[[THEN:.*]]
 ; CHECK: [[THEN]]:
 ; CHECK-NEXT: [[NOT_A:%.*]] = xor i32 [[A]], -1
@@ -364,7 +374,7 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-NEXT: store i32 [[P]], ptr [[GEP]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 9
 ; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], 322
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -400,49 +410,56 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT: [[TMP0:%.*]] = add nuw i64 [[SMAX]], 1
 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP6]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP15]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT: [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP9]], splat (i64 2)
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP8]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> [[WIDE_VEC]])
-; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP16]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT: [[TMP14:%.*]] = icmp ult <vscale x 4 x i32> [[TMP13]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 2
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP14]])
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP22]], i32 1, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i8> poison)
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_VEC]])
+; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 4 x i8> [[TMP17]] to <vscale x 4 x i32>
 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> splat (i1 true))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ]
 ; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT: [[L_DEAD:%.*]] = load i8, ptr [[GEP_SRC_0]], align 1
 ; CHECK-NEXT: [[IV_1:%.*]] = add i64 [[IV]], 1
@@ -453,7 +470,7 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT: store i32 [[EXT]], ptr [[GEP_DST]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 2
 ; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -481,26 +498,27 @@ exit:
 attributes #0 = { "target-features"="+64bit,+v" }
 ;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[META4]] = !{[[META5:![0-9]+]]}
-; CHECK: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]}
-; CHECK: [[META6]] = distinct !{[[META6]], !"LVerDomain"}
-; CHECK: [[META7]] = !{[[META8:![0-9]+]]}
-; CHECK: [[META8]] = distinct !{[[META8]], [[META6]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]}
-; CHECK: [[META11]] = !{[[META12:![0-9]+]]}
-; CHECK: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]]}
-; CHECK: [[META13]] = distinct !{[[META13]], !"LVerDomain"}
-; CHECK: [[META14]] = !{[[META15:![0-9]+]]}
-; CHECK: [[META15]] = distinct !{[[META15]], [[META13]]}
-; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
-; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]}
-; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
-; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
-; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
-; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
+; CHECK: [[META5]] = !{[[META6:![0-9]+]]}
+; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
+; CHECK: [[META7]] = distinct !{[[META7]], !"LVerDomain"}
+; CHECK: [[META8]] = !{[[META9:![0-9]+]]}
+; CHECK: [[META9]] = distinct !{[[META9]], [[META7]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META3]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
+; CHECK: [[META12]] = !{[[META13:![0-9]+]]}
+; CHECK: [[META13]] = distinct !{[[META13]], [[META14:![0-9]+]]}
+; CHECK: [[META14]] = distinct !{[[META14]], !"LVerDomain"}
+; CHECK: [[META15]] = !{[[META16:![0-9]+]]}
+; CHECK: [[META16]] = distinct !{[[META16]], [[META14]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META3]]}
+; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]]}
+; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META3]], [[META1]]}
+; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]], [[META3]]}
+; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META3]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
index db3215a6d2d3d..7e489268ad750 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
@@ -13,15 +13,14 @@ target triple = "riscv64"
 define void @vector_add(ptr noalias nocapture %a, i64 %v) {
 ; CHECK-LABEL: @vector_add(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP6]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
@@ -29,28 +28,31 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]])
 ; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP7]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP9]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ELEM]], [[V]]
 ; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
@@ -74,44 +76,47 @@ for.end:
 define i64 @vector_add_reduce(ptr noalias nocapture %a) {
 ; CHECK-LABEL: @vector_add_reduce(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP6]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
-; CHECK-NEXT: [[TMP9]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP8]])
+; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[VEC_PHI]], [[VP_OP_LOAD]]
+; CHECK-NEXT: [[TMP9]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[VEC_PHI]], i32 [[TMP8]])
+; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP9]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: for.end:
 ; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index f02e5de8950b4..5d92081203fab 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -10,15 +10,14 @@ target triple = "riscv64"
 define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-LABEL: @vector_udiv(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP6]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
@@ -26,28 +25,31 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]])
 ; CHECK-NEXT: [[TMP9:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP7]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP9]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT: [[DIVREM:%.*]] = udiv i64 [[ELEM]], [[V]]
 ; CHECK-NEXT: store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
@@ -61,14 +63,10 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED: vector.body:
 ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
-; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
-; FIXED-NEXT: [[TMP4:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; FIXED-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
-; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP1]], align 8
-; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP3]], align 8
-; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP1]], align 8
+; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FIXED: middle.block:
@@ -108,15 +106,14 @@ for.end:
 define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-LABEL: @vector_sdiv(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP6]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
@@ -124,28 +121,31 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]])
 ; CHECK-NEXT: [[TMP9:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP7]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP9]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT: [[DIVREM:%.*]] = sdiv i64 [[ELEM]], [[V]]
 ; CHECK-NEXT: store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
@@ -159,14 +159,10 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED: vector.body:
 ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
-; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
-; FIXED-NEXT: [[TMP4:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; FIXED-NEXT: [[TMP5:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
-; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP1]], align 8
-; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP3]], align 8
-; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP1]], align 8
+; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; FIXED: middle.block:
@@ -206,15 +202,14 @@ for.end:
 define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-LABEL: @vector_urem(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP6]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
@@ -222,28 +217,31 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]])
 ; CHECK-NEXT: [[TMP9:%.*]] = urem <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP7]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP9]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT: [[DIVREM:%.*]] = urem i64 [[ELEM]], [[V]]
 ; CHECK-NEXT: store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
@@ -257,14 +255,10 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED: vector.body:
 ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
-; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
-; FIXED-NEXT: [[TMP4:%.*]] = urem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; FIXED-NEXT: [[TMP5:%.*]] = urem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
-; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP1]], align 8
-; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP3]], align 8
-; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP1]], align 8
+; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ;
FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: @@ -304,15 +298,14 @@ for.end: define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @vector_srem( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP6]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 @@ -320,28 +313,31 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP10]]) ; CHECK-NEXT: [[TMP9:%.*]] = srem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP9]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP9]], ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP10]]) +; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, 
[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[DIVREM:%.*]] = srem i64 [[ELEM]], [[V]] ; CHECK-NEXT: store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -355,14 +351,10 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 -; FIXED-NEXT: [[TMP4:%.*]] = srem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 ; FIXED-NEXT: [[TMP5:%.*]] = srem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP1]], align 8 -; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP3]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP1]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: @@ -402,40 +394,47 @@ for.end: define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @predicated_udiv( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP9]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = select [[TMP6]], [[BROADCAST_SPLAT]], splat (i64 1) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP12]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP15:%.*]] = icmp ult [[TMP7]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]]) +; CHECK-NEXT: [[TMP16:%.*]] = select [[TMP15]], [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = select [[TMP16]], [[BROADCAST_SPLAT]], splat (i64 1) ; CHECK-NEXT: [[TMP11:%.*]] = udiv [[WIDE_LOAD]], [[TMP10]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP6]], [[TMP11]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], [[TMP11]], [[WIDE_LOAD]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[PREDPHI]], ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[V]], 0 @@ -448,7 +447,7 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -464,16 +463,11 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP5]] +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; FIXED-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]] -; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]] ; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8 -; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP4]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP2]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: @@ -525,40 +519,47 @@ for.end: define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @predicated_sdiv( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP9]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = select [[TMP6]], [[BROADCAST_SPLAT]], splat (i64 1) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP12]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP15:%.*]] = icmp ult [[TMP7]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr 
[[TMP8]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]]) +; CHECK-NEXT: [[TMP16:%.*]] = select [[TMP15]], [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = select [[TMP16]], [[BROADCAST_SPLAT]], splat (i64 1) ; CHECK-NEXT: [[TMP11:%.*]] = sdiv [[WIDE_LOAD]], [[TMP10]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP6]], [[TMP11]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], [[TMP11]], [[WIDE_LOAD]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[PREDPHI]], ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[V]], 0 @@ -571,7 +572,7 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -587,16 +588,11 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP5]] +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ; FIXED-NEXT: [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]] -; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]] ; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> 
[[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8 -; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP4]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP2]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: @@ -648,37 +644,44 @@ for.end: define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-LABEL: @predicated_udiv_by_constant( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP8]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP14]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP15:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], splat (i64 42) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP14]]) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_LOAD]], splat (i64 42) ; CHECK-NEXT: [[TMP10:%.*]] = udiv [[WIDE_LOAD]], splat (i64 27) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP10]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = select [[TMP15]], [[TMP9]], zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP11]], [[WIDE_LOAD]], [[TMP10]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[PREDPHI]], ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP14]]) +; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP14]] 
to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP12]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[ELEM]], 42 @@ -691,7 +694,7 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -703,18 +706,12 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 -; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42) +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 ; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42) -; FIXED-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], splat (i64 27) ; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27) -; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]] ; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP1]], align 8 -; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP1]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: @@ -766,37 +763,44 @@ for.end: define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-LABEL: @predicated_sdiv_by_constant( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() 
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP8]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP14]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP15:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], splat (i64 42) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP14]]) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_LOAD]], splat (i64 42) ; CHECK-NEXT: [[TMP10:%.*]] = sdiv [[WIDE_LOAD]], splat (i64 27) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP10]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = select [[TMP15]], [[TMP9]], zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP11]], [[WIDE_LOAD]], [[TMP10]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[PREDPHI]], ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP14]]) +; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP12]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 
[[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[ELEM]], 42 @@ -809,7 +813,7 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -821,18 +825,12 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 -; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42) +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 ; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42) -; FIXED-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], splat (i64 27) ; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27) -; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]] ; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP1]], align 8 -; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP1]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; FIXED: middle.block: @@ -884,38 +882,45 @@ for.end: define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; CHECK-LABEL: @predicated_sdiv_by_minus_one( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP8]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: 
[[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP12]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-NEXT: [[TMP15:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP7]], splat (i1 true), i32 [[TMP12]]) ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], splat (i8 -128) -; CHECK-NEXT: [[TMP10:%.*]] = select [[TMP9]], splat (i8 -1), splat (i8 1) +; CHECK-NEXT: [[TMP16:%.*]] = select [[TMP15]], [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = select [[TMP16]], splat (i8 -1), splat (i8 1) ; CHECK-NEXT: [[TMP11:%.*]] = sdiv [[WIDE_LOAD]], [[TMP10]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP11]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP7]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], [[TMP11]], [[WIDE_LOAD]] +; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[PREDPHI]], ptr align 1 [[TMP7]], splat (i1 true), i32 [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ELEM:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[ELEM]], -128 @@ -928,7 +933,7 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: store i8 [[PHI]], ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label 
[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -940,20 +945,13 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 32 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP3]], align 1 -; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD]], splat (i8 -128) +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1 ; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD1]], splat (i8 -128) -; FIXED-NEXT: [[TMP6:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> splat (i8 -1), <32 x i8> splat (i8 1) ; FIXED-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> splat (i8 -1), <32 x i8> splat (i8 1) -; FIXED-NEXT: [[TMP8:%.*]] = sdiv <32 x i8> [[WIDE_LOAD]], [[TMP6]] ; FIXED-NEXT: [[TMP9:%.*]] = sdiv <32 x i8> [[WIDE_LOAD1]], [[TMP7]] -; FIXED-NEXT: [[PREDPHI:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> [[TMP8]], <32 x i8> [[WIDE_LOAD]] ; FIXED-NEXT: [[PREDPHI2:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP9]], <32 x i8> [[WIDE_LOAD1]] -; FIXED-NEXT: store <32 x i8> [[PREDPHI]], ptr [[TMP1]], align 1 -; FIXED-NEXT: store <32 x i8> [[PREDPHI2]], ptr [[TMP3]], align 1 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 +; FIXED-NEXT: store <32 x i8> [[PREDPHI2]], ptr [[TMP1]], align 1 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; FIXED: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll index ba6adc3cbf474..22a5a0c7b7b6d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll @@ -26,37 +26,39 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) { ; ZVFHMIN-LABEL: define void @fadd( ; ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; ZVFHMIN-NEXT: [[ENTRY:.*]]: -; ZVFHMIN-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; ZVFHMIN-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8 -; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP8]] -; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; ZVFHMIN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; ZVFHMIN: [[VECTOR_PH]]: ; ZVFHMIN-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8 -; ZVFHMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]] -; ZVFHMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; ZVFHMIN-NEXT: [[TMP3:%.*]] = sub i64 [[TMP10]], 1 +; ZVFHMIN-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP3]] +; ZVFHMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]] +; ZVFHMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; ZVFHMIN-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP12]], 8 ; ZVFHMIN-NEXT: br label 
%[[VECTOR_BODY:.*]] ; ZVFHMIN: [[VECTOR_BODY]]: -; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; ZVFHMIN-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; ZVFHMIN-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; ZVFHMIN-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] ; ZVFHMIN-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] -; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP1]], align 2 -; ZVFHMIN-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP2]], align 2 +; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 2 [[TMP1]], splat (i1 true), i32 [[TMP6]]) +; ZVFHMIN-NEXT: [[WIDE_LOAD1:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 2 [[TMP2]], splat (i1 true), i32 [[TMP6]]) ; ZVFHMIN-NEXT: [[TMP11:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] -; ZVFHMIN-NEXT: store [[TMP11]], ptr [[TMP1]], align 2 -; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; ZVFHMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; ZVFHMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; ZVFHMIN-NEXT: call void @llvm.vp.store.nxv8f16.p0( [[TMP11]], ptr align 2 [[TMP1]], splat (i1 true), i32 [[TMP6]]) +; ZVFHMIN-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64 +; ZVFHMIN-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[INDEX]] +; ZVFHMIN-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; ZVFHMIN-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; ZVFHMIN-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; ZVFHMIN: [[MIDDLE_BLOCK]]: -; ZVFHMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; ZVFHMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; ZVFHMIN-NEXT: br label %[[EXIT:.*]] ; ZVFHMIN: [[SCALAR_PH]]: -; ZVFHMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; ZVFHMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; ZVFHMIN-NEXT: br label %[[LOOP:.*]] ; ZVFHMIN: [[LOOP]]: -; ZVFHMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; ZVFHMIN-NEXT: [[I:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] ; ZVFHMIN-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]] ; ZVFHMIN-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]] ; ZVFHMIN-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2 @@ -65,7 +67,7 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) { ; ZVFHMIN-NEXT: store half [[Z]], ptr [[A_GEP]], align 2 ; ZVFHMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1 ; ZVFHMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] -; ZVFHMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; ZVFHMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; ZVFHMIN: [[EXIT]]: ; ZVFHMIN-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll index c9ba2af92df7e..8657e2f25af05 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll @@ -12,7 +12,7 @@ define void @fmin32(ptr 
noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP8]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 15, i64 [[TMP13]]) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: @@ -75,7 +75,7 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; ZVFHMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) +; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 15, i64 [[TMP1]]) ; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]] ; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; ZVFHMIN: [[VECTOR_MEMCHECK]]: @@ -161,7 +161,7 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP8]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 15, i64 [[TMP13]]) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: @@ -224,7 +224,7 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; ZVFHMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) +; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 15, i64 [[TMP1]]) ; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]] ; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; ZVFHMIN: [[VECTOR_MEMCHECK]]: @@ -310,7 +310,7 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP8]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 15, i64 [[TMP13]]) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: @@ -373,7 +373,7 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; ZVFHMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) +; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 15, i64 [[TMP1]]) ; 
ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]] ; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; ZVFHMIN: [[VECTOR_MEMCHECK]]: @@ -459,7 +459,7 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP8]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 15, i64 [[TMP13]]) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: @@ -522,7 +522,7 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 ; ZVFHMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) +; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 15, i64 [[TMP1]]) ; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]] ; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; ZVFHMIN: [[VECTOR_MEMCHECK]]: @@ -606,11 +606,7 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: [[INPUT23:%.*]] = ptrtoint ptr [[INPUT2]] to i64 ; CHECK-NEXT: [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64 ; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP6]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]]) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8 @@ -625,28 +621,33 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP10]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 4096, [[TMP8]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 4096, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: 
[[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 2 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 2 [[TMP2]], splat (i1 true), i32 [[TMP13]]) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP4]], align 2 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 2 [[TMP4]], splat (i1 true), i32 [[TMP13]]) ; CHECK-NEXT: [[TMP17:%.*]] = call @llvm.minimumnum.nxv8f16( [[WIDE_LOAD]], [[WIDE_LOAD5]]) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]] -; CHECK-NEXT: store [[TMP17]], ptr [[TMP7]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv8f16.p0( [[TMP17]], ptr align 2 [[TMP7]], splat (i1 true), i32 [[TMP13]]) +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] @@ -659,7 +660,7 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -669,11 +670,7 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: [[INPUT23:%.*]] = ptrtoint ptr [[INPUT2]] to i64 ; ZVFHMIN-NEXT: [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64 ; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 -; ZVFHMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; ZVFHMIN-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) -; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]] -; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; ZVFHMIN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; ZVFHMIN: [[VECTOR_MEMCHECK]]: ; ZVFHMIN-NEXT: 
[[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8 @@ -688,28 +685,33 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN: [[VECTOR_PH]]: ; ZVFHMIN-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8 -; ZVFHMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]] -; ZVFHMIN-NEXT: [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]] +; ZVFHMIN-NEXT: [[TMP14:%.*]] = sub i64 [[TMP10]], 1 +; ZVFHMIN-NEXT: [[N_RND_UP:%.*]] = add i64 4096, [[TMP14]] +; ZVFHMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]] +; ZVFHMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; ZVFHMIN-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8 ; ZVFHMIN-NEXT: br label %[[VECTOR_BODY:.*]] ; ZVFHMIN: [[VECTOR_BODY]]: -; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; ZVFHMIN-NEXT: [[AVL:%.*]] = phi i64 [ 4096, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; ZVFHMIN-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; ZVFHMIN-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]] -; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 2 +; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 2 [[TMP13]], splat (i1 true), i32 [[TMP19]]) ; ZVFHMIN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]] -; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP15]], align 2 +; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 2 [[TMP15]], splat (i1 true), i32 [[TMP19]]) ; ZVFHMIN-NEXT: [[TMP17:%.*]] = call @llvm.minimumnum.nxv8f16( [[WIDE_LOAD]], [[WIDE_LOAD5]]) ; ZVFHMIN-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]] -; ZVFHMIN-NEXT: store [[TMP17]], ptr [[TMP18]], align 2 -; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; ZVFHMIN-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; ZVFHMIN-NEXT: call void @llvm.vp.store.nxv8f16.p0( [[TMP17]], ptr align 2 [[TMP18]], splat (i1 true), i32 [[TMP19]]) +; ZVFHMIN-NEXT: [[TMP16:%.*]] = zext i32 [[TMP19]] to i64 +; ZVFHMIN-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] +; ZVFHMIN-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; ZVFHMIN-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 4096 ; ZVFHMIN-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; ZVFHMIN: [[MIDDLE_BLOCK]]: -; ZVFHMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, [[N_VEC]] -; ZVFHMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; ZVFHMIN-NEXT: br label %[[EXIT:.*]] ; ZVFHMIN: [[SCALAR_PH]]: -; ZVFHMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; ZVFHMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; ZVFHMIN-NEXT: br label %[[FOR_BODY:.*]] ; ZVFHMIN: [[FOR_BODY]]: ; ZVFHMIN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] @@ -722,7 +724,7 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, 
ptr noundef rea ; ZVFHMIN-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 ; ZVFHMIN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; ZVFHMIN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 4096 -; ZVFHMIN-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; ZVFHMIN-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; ZVFHMIN: [[EXIT]]: ; ZVFHMIN-NEXT: ret void ; @@ -755,11 +757,7 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: [[INPUT23:%.*]] = ptrtoint ptr [[INPUT2]] to i64 ; CHECK-NEXT: [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64 ; CHECK-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP6]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]]) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8 @@ -774,28 +772,33 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP10]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 4096, [[TMP8]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 4096, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 2 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 2 [[TMP2]], splat (i1 true), i32 [[TMP13]]) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP4]], align 2 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 2 [[TMP4]], splat (i1 true), i32 [[TMP13]]) ; CHECK-NEXT: [[TMP17:%.*]] = call @llvm.maximumnum.nxv8f16( [[WIDE_LOAD]], [[WIDE_LOAD5]]) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]] -; CHECK-NEXT: store [[TMP17]], ptr [[TMP7]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP20]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv8f16.p0( [[TMP17]], ptr align 2 [[TMP7]], splat (i1 true), i32 [[TMP13]]) +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] @@ -808,7 +811,7 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -818,11 +821,7 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: [[INPUT23:%.*]] = ptrtoint ptr [[INPUT2]] to i64 ; ZVFHMIN-NEXT: [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64 ; ZVFHMIN-NEXT: [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64 -; ZVFHMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; ZVFHMIN-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; ZVFHMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) -; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]] -; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; ZVFHMIN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; ZVFHMIN: [[VECTOR_MEMCHECK]]: ; ZVFHMIN-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8 @@ -837,28 +836,33 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN: [[VECTOR_PH]]: ; ZVFHMIN-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8 -; ZVFHMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]] -; ZVFHMIN-NEXT: [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]] +; ZVFHMIN-NEXT: [[TMP14:%.*]] = sub i64 [[TMP10]], 1 +; ZVFHMIN-NEXT: [[N_RND_UP:%.*]] = add i64 4096, [[TMP14]] +; ZVFHMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]] +; ZVFHMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; ZVFHMIN-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; ZVFHMIN-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8 ; ZVFHMIN-NEXT: br label %[[VECTOR_BODY:.*]] ; ZVFHMIN: [[VECTOR_BODY]]: -; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; ZVFHMIN-NEXT: [[AVL:%.*]] = phi i64 [ 4096, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; ZVFHMIN-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; ZVFHMIN-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]] -; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 2 +; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 2 [[TMP13]], splat (i1 true), i32 [[TMP19]]) ; ZVFHMIN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]] -; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP15]], align 2 +; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 2 [[TMP15]], splat (i1 true), i32 [[TMP19]]) ; ZVFHMIN-NEXT: [[TMP17:%.*]] = call @llvm.maximumnum.nxv8f16( [[WIDE_LOAD]], [[WIDE_LOAD5]]) ; ZVFHMIN-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]] -; ZVFHMIN-NEXT: store [[TMP17]], ptr [[TMP18]], align 2 -; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; ZVFHMIN-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; ZVFHMIN-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; ZVFHMIN-NEXT: call void @llvm.vp.store.nxv8f16.p0( [[TMP17]], ptr align 2 [[TMP18]], splat (i1 true), i32 [[TMP19]]) +; ZVFHMIN-NEXT: [[TMP16:%.*]] = zext i32 [[TMP19]] to i64 +; ZVFHMIN-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] +; ZVFHMIN-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; ZVFHMIN-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 4096 +; ZVFHMIN-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; ZVFHMIN: [[MIDDLE_BLOCK]]: -; ZVFHMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, [[N_VEC]] -; ZVFHMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; ZVFHMIN-NEXT: br label %[[EXIT:.*]] ; ZVFHMIN: [[SCALAR_PH]]: -; ZVFHMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; ZVFHMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; ZVFHMIN-NEXT: br label %[[FOR_BODY:.*]] ; ZVFHMIN: [[FOR_BODY]]: ; ZVFHMIN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] @@ -871,7 +875,7 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 ; ZVFHMIN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; ZVFHMIN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 4096 -; ZVFHMIN-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; ZVFHMIN-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; ZVFHMIN: [[EXIT]]: ; ZVFHMIN-NEXT: ret void ; @@ -907,10 +911,11 @@ declare half @llvm.maximumnum.f16(half, half) ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} ; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} -; CHECK: [[LOOP12]] = distinct 
!{[[LOOP12]], [[META1]], [[META2]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META11:![0-9]+]], [[META2]]} +; CHECK: [[META11]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META11]], [[META2]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} ;. ; ZVFHMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; ZVFHMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -922,8 +927,9 @@ declare half @llvm.maximumnum.f16(half, half) ; ZVFHMIN: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} ; ZVFHMIN: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} ; ZVFHMIN: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} -; ZVFHMIN: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} -; ZVFHMIN: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} -; ZVFHMIN: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} -; ZVFHMIN: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} +; ZVFHMIN: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META11:![0-9]+]], [[META2]]} +; ZVFHMIN: [[META11]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +; ZVFHMIN: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} +; ZVFHMIN: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META11]], [[META2]]} +; ZVFHMIN: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll b/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll index 1ea70b6d342f8..ae18c636a7e59 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll @@ -3,8 +3,8 @@ ; CHECK: LV: Loop hints: force=enabled ; CHECK: LV: Scalar loop costs: 4. -; ChosenFactor.Cost is 4, but the real cost will be divided by the width, which is 2. -; CHECK: Cost for VF 2: 4 (Estimated cost per lane: 2.0) +; ChosenFactor.Cost is 9, but the real cost is the total divided by the estimated lane count, giving a per-lane cost of 2.2. +; CHECK: Cost for VF vscale x 2: 9 (Estimated cost per lane: 2.2) ; Regardless of force vectorization or not, this loop will eventually be vectorized because of the cost model. ; Therefore, the following message does not need to be printed even if vectorization is explicitly forced in the metadata. ; CHECK-NOT: LV: Vectorization seems to be not beneficial, but was forced by a user.
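For context on the updated check above: the per-lane figure divides the candidate's total cost by the expected lane count of the scalable VF. Assuming a tuning vscale of 2 (an assumption for illustration; the patch does not state the value), VF vscale x 2 covers 4 lanes:

  cost per lane = 9 / (vscale * 2) = 9 / 4 = 2.25, printed to one decimal as "2.2"

which lines up with the old fixed-width line, 4 / 2 = 2.0 for VF 2.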
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll index e6825faf3f8d5..60ea48e3e4990 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll @@ -20,11 +20,7 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 { ; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 3 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[UMIN21]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.umax.i64(i64 128, i64 [[TMP7]]) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP5]], [[TMP8]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP31:%.*]] = shl nsw i64 [[X_I64]], 1 ; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP31]] @@ -59,40 +55,40 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP46:%.*]] = mul nuw i64 [[TMP45]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], [[TMP46]] -; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP48]] +; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[TMP46]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP5]], [[TMP22]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP46]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP52:%.*]] = mul nuw i64 [[TMP51]], 8 -; CHECK-NEXT: [[TMP49:%.*]] = mul i64 [[N_VEC]], 3 -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[X_I64]], [[TMP49]] -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[TMP50:%.*]] = mul i32 [[DOTCAST]], 3 -; CHECK-NEXT: [[IND_END22:%.*]] = add i32 [[X_I32]], [[TMP50]] ; CHECK-NEXT: [[TMP53:%.*]] = call @llvm.stepvector.nxv8i64() ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[X_I64]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP55:%.*]] = mul [[TMP53]], splat (i64 3) ; CHECK-NEXT: [[INDUCTION:%.*]] = add [[DOTSPLAT]], [[TMP55]] -; CHECK-NEXT: [[TMP58:%.*]] = mul i64 3, [[TMP52]] -; CHECK-NEXT: [[DOTSPLATINSERT24:%.*]] = insertelement poison, i64 [[TMP58]], i64 0 -; CHECK-NEXT: [[DOTSPLAT25:%.*]] = shufflevector [[DOTSPLATINSERT24]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP5]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) +; CHECK-NEXT: [[TMP28:%.*]] = zext i32 [[TMP27]] to i64 +; CHECK-NEXT: 
[[TMP58:%.*]] = mul i64 3, [[TMP28]] +; CHECK-NEXT: [[DOTSPLATINSERT24:%.*]] = insertelement poison, i64 [[TMP58]], i64 0 +; CHECK-NEXT: [[DOTSPLAT25:%.*]] = shufflevector [[DOTSPLATINSERT24]], poison, zeroinitializer ; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i16, ptr [[A]], [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv8i16.nxv8p0( zeroinitializer, [[TMP59]], i32 2, splat (i1 true)), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP52]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP59]], splat (i1 true), i32 [[TMP27]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP27]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP47]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP47]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT25]] -; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP60]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X_I64]], %[[ENTRY]] ], [ [[X_I64]], %[[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[ENTRY]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[X_I64]], %[[ENTRY]] ], [ [[X_I64]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i32 [ [[X_I32]], %[[ENTRY]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -107,7 +103,7 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 { ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 3 ; CHECK-NEXT: [[TMP64]] = trunc i64 [[IV_NEXT]] to i32 ; CHECK-NEXT: [[C:%.*]] = icmp slt i64 [[IV]], 99 -; CHECK-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -144,8 +140,9 @@ attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } ; CHECK: [[META3]] = !{[[META4:![0-9]+]], [[META5:![0-9]+]]} ; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} ; CHECK: [[META5]] = distinct !{[[META5]], [[META2]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]], [[META8:![0-9]+]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]]} ; CHECK: [[META7]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]]} +; CHECK: [[META8]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +; CHECK: [[META9]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META7]]} ;. 
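The regenerated checks above and below all converge on the same EVL (explicit vector length) tail-folded loop shape that preferPredicateOverEpilogue now selects. As a reference, here is a minimal hand-written sketch of that shape; it is not taken from this patch, and the function name and the i32/VF-4 element choice are illustrative only. An AVL phi tracks the remaining trip count, @llvm.experimental.get.vector.length picks this iteration's lane count, the vp.* memory ops are bounded by that count, and the induction advances by the returned EVL, so no scalar remainder loop runs.

declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg)
declare void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, <vscale x 4 x i1>, i32)

define void @evl_loop_sketch(ptr %p, i64 %n) {
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; AVL: iterations still to be processed
  %avl = phi i64 [ %n, %entry ], [ %avl.next, %vector.body ]
  ; EVL: lanes handled this iteration, at most vscale x 4, clamped by AVL
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 4, i1 true)
  %gep = getelementptr i32, ptr %p, i64 %index
  ; all-true mask: the predication comes from the EVL operand, not the mask
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> zeroinitializer, ptr align 4 %gep, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %evl.zext = zext i32 %evl to i64
  ; advance the IV and shrink the AVL by the lanes actually processed
  %index.next = add nuw i64 %index, %evl.zext
  %avl.next = sub nuw i64 %avl, %evl.zext
  %done = icmp eq i64 %index.next, %n
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}

The interleaved-access tests below differ from this sketch in one respect: because interleave groups lower to llvm.masked.load/llvm.masked.store rather than vp intrinsics, they additionally materialize a lane mask (a stepvector compared ult against the splatted EVL) and interleave it per member, while the AVL/EVL bookkeeping stays the same.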
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index e226eeac9e95f..ad25a5f50c574 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -6,41 +6,49 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor2_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP12]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP7]], align 4 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP13:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP14:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP14]] +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP13]], [[TMP13]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP8]], splat (i32 1) ; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 2) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP10]], [[TMP11]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: 
[[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP13]], [[TMP13]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv8i32.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 @@ -53,7 +61,7 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -102,41 +110,49 @@ define void @load_store_factor2_i32(ptr %p) { ; ; SCALABLE-LABEL: @load_store_factor2_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP12]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 1 -; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP7]], align 4 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 
@llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP14:%.*]] = shl i64 [[INDEX]], 1 +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP14]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP13]], [[TMP13]]) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; SCALABLE-NEXT: [[TMP10:%.*]] = add [[TMP8]], splat (i32 1) ; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 2) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP10]], [[TMP11]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP13]], [[TMP13]]) +; SCALABLE-NEXT: call void @llvm.masked.store.nxv8i32.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: br label [[EXIT:%.*]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] ; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 @@ -149,7 +165,7 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -180,41 +196,49 @@ exit: define void @load_store_factor2_i64(ptr %p) { 
; CHECK-LABEL: @load_store_factor2_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP12]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP7]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP10:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv4i1( [[TMP10]], [[TMP10]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv4i64.p0(ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_MASKED_VEC]]) +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP8]], splat (i64 1) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP20]], splat (i64 1) ; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP10]], [[TMP11]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP13]], [[TMP11]]) +; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv4i1( [[TMP10]], [[TMP10]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 
8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -227,7 +251,7 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -276,41 +300,49 @@ define void @load_store_factor2_i64(ptr %p) { ; ; SCALABLE-LABEL: @load_store_factor2_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP12]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 1 -; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP7]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; SCALABLE-NEXT: 
[[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 1 +; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv4i1( [[TMP10]], [[TMP10]]) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv4i64.p0(ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_MASKED_VEC]]) +; SCALABLE-NEXT: [[TMP20:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP10:%.*]] = add [[TMP8]], splat (i64 1) +; SCALABLE-NEXT: [[TMP13:%.*]] = add [[TMP20]], splat (i64 1) ; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP10]], [[TMP11]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP13]], [[TMP11]]) +; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv4i1( [[TMP10]], [[TMP10]]) +; SCALABLE-NEXT: call void @llvm.masked.store.nxv4i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: br label [[EXIT:%.*]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -323,7 +355,7 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: 
ret void ; @@ -354,24 +386,30 @@ exit: define void @load_store_factor3_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP15]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP1]], align 4 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , } @llvm.vector.deinterleave3.nxv12i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP14:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP16]] +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave3.nxv12i1( [[TMP14]], [[TMP14]], [[TMP14]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv12i32.p0(ptr [[TMP17]], i32 4, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , } @llvm.vector.deinterleave3.nxv12i32( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 2 @@ -379,18 +417,20 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP9]], splat (i32 2) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], splat (i32 3) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv12i32( [[TMP11]], [[TMP12]], [[TMP13]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP1]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave3.nxv12i1( [[TMP14]], [[TMP14]], [[TMP14]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv12i32.p0( 
[[INTERLEAVED_VEC]], ptr [[TMP17]], i32 4, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 @@ -408,7 +448,7 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -466,24 +506,30 @@ define void @load_store_factor3_i32(ptr %p) { ; ; SCALABLE-LABEL: @load_store_factor3_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP15:%.*]] = sub i64 [[TMP3]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP15]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3 -; SCALABLE-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP0]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP1]], align 4 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , } @llvm.vector.deinterleave3.nxv12i32( [[WIDE_VEC]]) +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement 
poison, i32 [[TMP7]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[INDEX]], 3 +; SCALABLE-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP16]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave3.nxv12i1( [[TMP14]], [[TMP14]], [[TMP14]]) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv12i32.p0(ptr [[TMP17]], i32 4, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , } @llvm.vector.deinterleave3.nxv12i32( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 1 ; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 2 @@ -491,18 +537,20 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP9]], splat (i32 2) ; SCALABLE-NEXT: [[TMP13:%.*]] = add [[TMP10]], splat (i32 3) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv12i32( [[TMP11]], [[TMP12]], [[TMP13]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP1]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave3.nxv12i1( [[TMP14]], [[TMP14]], [[TMP14]]) +; SCALABLE-NEXT: call void @llvm.masked.store.nxv12i32.p0( [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 4, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] +; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SCALABLE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: br label [[EXIT:%.*]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 ; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] ; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 @@ -520,7 +568,7 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -557,43 +605,51 
@@ exit: define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP15]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP1]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , } @llvm.vector.deinterleave3.nxv6i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP11:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave3.nxv6i1( [[TMP11]], [[TMP11]], [[TMP11]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv6i64.p0(ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , } @llvm.vector.deinterleave3.nxv6i64( [[WIDE_MASKED_VEC]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 2 -; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP8]], splat (i64 1) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP23]], splat (i64 1) ; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP9]], splat (i64 2) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], splat (i64 3) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv6i64( [[TMP11]], [[TMP12]], [[TMP13]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv6i64( 
[[TMP25]], [[TMP12]], [[TMP13]]) +; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave3.nxv6i1( [[TMP11]], [[TMP11]], [[TMP11]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv6i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -611,7 +667,7 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -669,43 +725,51 @@ define void @load_store_factor3_i64(ptr %p) { ; ; SCALABLE-LABEL: @load_store_factor3_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP15:%.*]] = sub i64 [[TMP3]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP15]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3 -; SCALABLE-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP1]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , } @llvm.vector.deinterleave3.nxv6i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 0 +; 
+; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0
+; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
+; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 3
+; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 6 x i1> @llvm.vector.interleave3.nxv6i1(<vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]])
+; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.masked.load.nxv6i64.p0(ptr [[TMP14]], i32 8, <vscale x 6 x i1> [[INTERLEAVED_MASK]], <vscale x 6 x i64> poison)
+; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> [[WIDE_MASKED_VEC]])
+; SCALABLE-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
-; SCALABLE-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP8]], splat (i64 1)
+; SCALABLE-NEXT: [[TMP25:%.*]] = add <vscale x 2 x i64> [[TMP23]], splat (i64 1)
; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 3)
-; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
-; SCALABLE-NEXT: store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP25]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
+; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 6 x i1> @llvm.vector.interleave3.nxv6i1(<vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> [[TMP11]])
+; SCALABLE-NEXT: call void @llvm.masked.store.nxv6i64.p0(<vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 8, <vscale x 6 x i1> [[INTERLEAVED_MASK1]])
+; SCALABLE-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64
+; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]]
+; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
+; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; SCALABLE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; SCALABLE: middle.block:
-; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT: br label [[EXIT:%.*]]
; SCALABLE: scalar.ph:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; SCALABLE-NEXT: br label [[LOOP:%.*]]
; SCALABLE: loop:
-; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
@@ -723,7 +787,7 @@ define void @load_store_factor3_i64(ptr %p) {
; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
; SCALABLE: exit:
; SCALABLE-NEXT: ret void
;
@@ -760,45 +824,53 @@ exit:
define void @load_store_factor4(ptr %p) {
; CHECK-LABEL: @load_store_factor4(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP18]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[TMP7]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i1> [[TMP10]])
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64.p0(ptr [[TMP9]], i32 8, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i64> poison)
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_MASKED_VEC]])
+; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 3
-; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 1)
+; CHECK-NEXT: [[TMP26:%.*]] = add <vscale x 2 x i64> [[TMP24]], splat (i64 1)
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
; CHECK-NEXT: [[TMP16:%.*]]
= add [[TMP12]], splat (i64 3) ; CHECK-NEXT: [[TMP17:%.*]] = add [[TMP13]], splat (i64 4) -; CHECK-NEXT: [[INTERLEAVED_VEC4:%.*]] = call @llvm.vector.interleave4.nxv8i64( [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC4]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv8i64( [[TMP26]], [[TMP15]], [[TMP16]], [[TMP17]]) +; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave4.nxv8i1( [[TMP10]], [[TMP10]], [[TMP10]], [[TMP10]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP9]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 4 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -821,7 +893,7 @@ define void @load_store_factor4(ptr %p) { ; CHECK-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -886,45 +958,53 @@ define void @load_store_factor4(ptr %p) { ; ; SCALABLE-LABEL: @load_store_factor4( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP18:%.*]] = sub i64 [[TMP3]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP18]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 
@llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 4 -; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP7]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv8i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv8i1( [[TMP10]], [[TMP10]], [[TMP10]], [[TMP10]]) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i64.p0(ptr [[TMP9]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv8i64( [[WIDE_MASKED_VEC]]) +; SCALABLE-NEXT: [[TMP24:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 ; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 ; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; SCALABLE-NEXT: [[TMP14:%.*]] = add [[TMP10]], splat (i64 1) +; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP24]], splat (i64 1) ; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) ; SCALABLE-NEXT: [[TMP16:%.*]] = add [[TMP12]], splat (i64 3) ; SCALABLE-NEXT: [[TMP17:%.*]] = add [[TMP13]], splat (i64 4) -; SCALABLE-NEXT: [[INTERLEAVED_VEC4:%.*]] = call @llvm.vector.interleave4.nxv8i64( [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC4]], ptr [[TMP7]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv8i64( [[TMP26]], [[TMP15]], [[TMP16]], [[TMP17]]) +; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave4.nxv8i1( [[TMP10]], [[TMP10]], [[TMP10]], [[TMP10]]) +; SCALABLE-NEXT: call void @llvm.masked.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP9]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] +; SCALABLE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SCALABLE-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: br label [[EXIT:%.*]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 4 ; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -947,7 +1027,7 @@ define void @load_store_factor4(ptr %p) { ; SCALABLE-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8 ; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -990,44 +1070,53 @@ exit: define void @load_store_factor5(ptr %p) { ; CHECK-LABEL: @load_store_factor5( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP4]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP1]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , } @llvm.vector.deinterleave5.nxv5i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 3 -; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP5]], splat (i64 1) -; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP6]], splat (i64 2) -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP7]], splat (i64 3) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP8]], splat (i64 4) -; CHECK-NEXT: [[TMP14:%.*]] = add [[TMP9]], splat (i64 5) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave5.nxv5i64( [[TMP10]], [[TMP11]], [[TMP12]], [[TMP13]], [[TMP14]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 
[[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.stepvector.nxv1i32() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult [[TMP18]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave5.nxv5i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv5i64.p0(ptr [[TMP19]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , } @llvm.vector.deinterleave5.nxv5i64( [[WIDE_MASKED_VEC]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP8]], splat (i64 1) +; CHECK-NEXT: [[TMP14:%.*]] = add [[TMP9]], splat (i64 2) +; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP10]], splat (i64 3) +; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP11]], splat (i64 4) +; CHECK-NEXT: [[TMP17:%.*]] = add [[TMP12]], splat (i64 5) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave5.nxv5i64( [[TMP13]], [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]]) +; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave5.nxv5i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv5i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP25]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 5 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -1055,7 +1144,7 @@ define void @load_store_factor5(ptr %p) { ; CHECK-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8 
; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -1129,44 +1218,53 @@ define void @load_store_factor5(ptr %p) { ; ; SCALABLE-LABEL: @load_store_factor5( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP4]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP3]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 5 -; SCALABLE-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP1]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , } @llvm.vector.deinterleave5.nxv5i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP5:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 2 -; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 3 -; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 4 -; SCALABLE-NEXT: [[TMP10:%.*]] = add [[TMP5]], splat (i64 1) -; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP6]], splat (i64 2) -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP7]], splat (i64 3) -; SCALABLE-NEXT: [[TMP13:%.*]] = add [[TMP8]], splat (i64 4) -; SCALABLE-NEXT: [[TMP14:%.*]] = add [[TMP9]], splat (i64 5) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave5.nxv5i64( [[TMP10]], [[TMP11]], [[TMP12]], [[TMP13]], [[TMP14]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP18:%.*]] = call @llvm.stepvector.nxv1i32() +; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult [[TMP18]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 5 +; SCALABLE-NEXT: 
[[TMP19:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave5.nxv5i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv5i64.p0(ptr [[TMP19]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , } @llvm.vector.deinterleave5.nxv5i64( [[WIDE_MASKED_VEC]]) +; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 2 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 3 +; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 4 +; SCALABLE-NEXT: [[TMP13:%.*]] = add [[TMP8]], splat (i64 1) +; SCALABLE-NEXT: [[TMP14:%.*]] = add [[TMP9]], splat (i64 2) +; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP10]], splat (i64 3) +; SCALABLE-NEXT: [[TMP16:%.*]] = add [[TMP11]], splat (i64 4) +; SCALABLE-NEXT: [[TMP17:%.*]] = add [[TMP12]], splat (i64 5) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave5.nxv5i64( [[TMP13]], [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]]) +; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave5.nxv5i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; SCALABLE-NEXT: call void @llvm.masked.store.nxv5i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: [[TMP25:%.*]] = zext i32 [[TMP7]] to i64 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP25]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]] +; SCALABLE-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SCALABLE-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: br label [[EXIT:%.*]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 5 ; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -1194,7 +1292,7 @@ define void @load_store_factor5(ptr %p) { ; SCALABLE-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8 ; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1243,46 +1341,55 @@ exit: define void @load_store_factor6(ptr %p) { ; CHECK-LABEL: @load_store_factor6( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label 
[[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP4]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP1]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , } @llvm.vector.deinterleave6.nxv6i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 3 -; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 5 -; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP5]], splat (i64 1) -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP6]], splat (i64 2) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP7]], splat (i64 3) -; CHECK-NEXT: [[TMP14:%.*]] = add [[TMP8]], splat (i64 4) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP9]], splat (i64 5) -; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP10]], splat (i64 6) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave6.nxv6i64( [[TMP11]], [[TMP12]], [[TMP13]], [[TMP14]], [[TMP15]], [[TMP16]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = call @llvm.stepvector.nxv1i32() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult [[TMP20]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave6.nxv6i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv6i64.p0(ptr [[TMP21]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , } @llvm.vector.deinterleave6.nxv6i64( [[WIDE_MASKED_VEC]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = 
extractvalue { , , , , , } [[STRIDED_VEC]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 5 +; CHECK-NEXT: [[TMP14:%.*]] = add [[TMP8]], splat (i64 1) +; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP9]], splat (i64 2) +; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP10]], splat (i64 3) +; CHECK-NEXT: [[TMP17:%.*]] = add [[TMP11]], splat (i64 4) +; CHECK-NEXT: [[TMP18:%.*]] = add [[TMP12]], splat (i64 5) +; CHECK-NEXT: [[TMP19:%.*]] = add [[TMP13]], splat (i64 6) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave6.nxv6i64( [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]]) +; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave6.nxv6i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv6i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP21]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: [[TMP28:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP28]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]] +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 6 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -1315,7 +1422,7 @@ define void @load_store_factor6(ptr %p) { ; CHECK-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -1397,46 +1504,55 @@ define void @load_store_factor6(ptr %p) { ; ; SCALABLE-LABEL: @load_store_factor6( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP4]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP3]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: 
[[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 6 -; SCALABLE-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP1]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , } @llvm.vector.deinterleave6.nxv6i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP5:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 2 -; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 3 -; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 4 -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 5 -; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP5]], splat (i64 1) -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP6]], splat (i64 2) -; SCALABLE-NEXT: [[TMP13:%.*]] = add [[TMP7]], splat (i64 3) -; SCALABLE-NEXT: [[TMP14:%.*]] = add [[TMP8]], splat (i64 4) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP9]], splat (i64 5) -; SCALABLE-NEXT: [[TMP16:%.*]] = add [[TMP10]], splat (i64 6) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave6.nxv6i64( [[TMP11]], [[TMP12]], [[TMP13]], [[TMP14]], [[TMP15]], [[TMP16]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP20:%.*]] = call @llvm.stepvector.nxv1i32() +; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult [[TMP20]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 6 +; SCALABLE-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave6.nxv6i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv6i64.p0(ptr [[TMP21]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , } @llvm.vector.deinterleave6.nxv6i64( [[WIDE_MASKED_VEC]]) +; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 2 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 3 +; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 4 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 5 +; SCALABLE-NEXT: [[TMP14:%.*]] = add [[TMP8]], splat (i64 1) +; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP9]], splat (i64 2) +; SCALABLE-NEXT: 
[[TMP16:%.*]] = add [[TMP10]], splat (i64 3) +; SCALABLE-NEXT: [[TMP17:%.*]] = add [[TMP11]], splat (i64 4) +; SCALABLE-NEXT: [[TMP18:%.*]] = add [[TMP12]], splat (i64 5) +; SCALABLE-NEXT: [[TMP19:%.*]] = add [[TMP13]], splat (i64 6) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave6.nxv6i64( [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]]) +; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave6.nxv6i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; SCALABLE-NEXT: call void @llvm.masked.store.nxv6i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP21]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: [[TMP28:%.*]] = zext i32 [[TMP7]] to i64 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP28]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]] +; SCALABLE-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SCALABLE-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: br label [[EXIT:%.*]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 6 ; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -1469,7 +1585,7 @@ define void @load_store_factor6(ptr %p) { ; SCALABLE-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8 ; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1524,48 +1640,57 @@ exit: define void @load_store_factor7(ptr %p) { ; CHECK-LABEL: @load_store_factor7( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP4]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] -; CHECK-NEXT: 
[[WIDE_VEC:%.*]] = load , ptr [[TMP1]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , } @llvm.vector.deinterleave7.nxv7i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 3 -; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 5 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 6 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP5]], splat (i64 1) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP6]], splat (i64 2) -; CHECK-NEXT: [[TMP14:%.*]] = add [[TMP7]], splat (i64 3) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP8]], splat (i64 4) -; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP9]], splat (i64 5) -; CHECK-NEXT: [[TMP17:%.*]] = add [[TMP10]], splat (i64 6) -; CHECK-NEXT: [[TMP18:%.*]] = add [[TMP11]], splat (i64 7) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave7.nxv7i64( [[TMP12]], [[TMP13]], [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.stepvector.nxv1i32() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult [[TMP22]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 7 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave7.nxv7i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv7i64.p0(ptr [[TMP23]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , } @llvm.vector.deinterleave7.nxv7i64( [[WIDE_MASKED_VEC]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 5 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 6 +; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP8]], splat (i64 1) +; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP9]], splat (i64 2) +; CHECK-NEXT: [[TMP17:%.*]] = add [[TMP10]], splat (i64 3) +; CHECK-NEXT: [[TMP18:%.*]] = add [[TMP11]], splat (i64 4) +; CHECK-NEXT: [[TMP19:%.*]] = add [[TMP12]], splat (i64 5) +; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP13]], splat (i64 6) 
+; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP14]], splat (i64 7) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave7.nxv7i64( [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]], [[TMP21]]) +; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave7.nxv7i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv7i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP23]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: [[TMP31:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP31]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP31]] +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 7 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -1603,7 +1728,7 @@ define void @load_store_factor7(ptr %p) { ; CHECK-NEXT: store i64 [[Y6]], ptr [[Q6]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -1694,48 +1819,57 @@ define void @load_store_factor7(ptr %p) { ; ; SCALABLE-LABEL: @load_store_factor7( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP3:%.*]] = sub i64 [[TMP4]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP3]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 7 -; SCALABLE-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP1]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , } @llvm.vector.deinterleave7.nxv7i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: 
[[TMP5:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 2 -; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 3 -; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 4 -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 5 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 6 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP5]], splat (i64 1) -; SCALABLE-NEXT: [[TMP13:%.*]] = add [[TMP6]], splat (i64 2) -; SCALABLE-NEXT: [[TMP14:%.*]] = add [[TMP7]], splat (i64 3) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP8]], splat (i64 4) -; SCALABLE-NEXT: [[TMP16:%.*]] = add [[TMP9]], splat (i64 5) -; SCALABLE-NEXT: [[TMP17:%.*]] = add [[TMP10]], splat (i64 6) -; SCALABLE-NEXT: [[TMP18:%.*]] = add [[TMP11]], splat (i64 7) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave7.nxv7i64( [[TMP12]], [[TMP13]], [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] -; SCALABLE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP22:%.*]] = call @llvm.stepvector.nxv1i32() +; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult [[TMP22]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 7 +; SCALABLE-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave7.nxv7i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv7i64.p0(ptr [[TMP23]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , } @llvm.vector.deinterleave7.nxv7i64( [[WIDE_MASKED_VEC]]) +; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 2 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 3 +; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 4 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 5 +; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 6 +; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP8]], splat (i64 1) +; SCALABLE-NEXT: [[TMP16:%.*]] = add [[TMP9]], splat (i64 2) +; SCALABLE-NEXT: [[TMP17:%.*]] = add [[TMP10]], splat (i64 3) +; SCALABLE-NEXT: [[TMP18:%.*]] = add [[TMP11]], splat (i64 4) +; SCALABLE-NEXT: [[TMP19:%.*]] = add [[TMP12]], splat (i64 5) +; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP13]], splat (i64 6) +; SCALABLE-NEXT: [[TMP21:%.*]] = add 
[[TMP14]], splat (i64 7) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave7.nxv7i64( [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]], [[TMP21]]) +; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave7.nxv7i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; SCALABLE-NEXT: call void @llvm.masked.store.nxv7i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP23]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: [[TMP31:%.*]] = zext i32 [[TMP7]] to i64 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP31]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP31]] +; SCALABLE-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SCALABLE-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: br label [[EXIT:%.*]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 7 ; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -1773,7 +1907,7 @@ define void @load_store_factor7(ptr %p) { ; SCALABLE-NEXT: store i64 [[Y6]], ptr [[Q6]], align 8 ; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1834,50 +1968,59 @@ exit: define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP36:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP36]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , , } @llvm.vector.deinterleave8.nxv8i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP11:%.*]] = 
extractvalue { , , , , , , , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 3 -; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 4 -; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 5 -; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 6 -; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 7 -; CHECK-NEXT: [[TMP19:%.*]] = add [[TMP11]], splat (i64 1) -; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 2) -; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 3) -; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 4) -; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 5) -; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 6) -; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 7) -; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 8) -; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave8.nxv8i64( [[TMP19]], [[TMP20]], [[TMP21]], [[TMP22]], [[TMP23]], [[TMP24]], [[TMP25]], [[TMP26]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC12]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] -; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv1i32() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave8.nxv8i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i64.p0(ptr [[TMP24]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , , } @llvm.vector.deinterleave8.nxv8i64( [[WIDE_MASKED_VEC]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 5 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 6 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 7 +; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP8]], splat (i64 1) +; CHECK-NEXT: [[TMP17:%.*]] = add [[TMP9]], splat (i64 2) +; CHECK-NEXT: [[TMP18:%.*]] = add [[TMP10]], splat (i64 3) +; CHECK-NEXT: [[TMP19:%.*]] = add [[TMP11]], splat (i64 4) +; CHECK-NEXT: 
[[TMP20:%.*]] = add [[TMP12]], splat (i64 5) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 6) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 7) +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 8) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave8.nxv8i64( [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]], [[TMP21]], [[TMP22]], [[TMP23]]) +; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave8.nxv8i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP24]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP34]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP34]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -1920,7 +2063,7 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -2017,50 +2160,59 @@ define void @load_store_factor8(ptr %p) { ; ; SCALABLE-LABEL: @load_store_factor8( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP36:%.*]] = sub i64 [[TMP1]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP36]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP3:%.*]] = shl i64 [[INDEX]], 3 -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]] -; SCALABLE-NEXT: 
[[WIDE_VEC:%.*]] = load , ptr [[TMP4]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , , } @llvm.vector.deinterleave8.nxv8i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 2 -; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 3 -; SCALABLE-NEXT: [[TMP15:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 4 -; SCALABLE-NEXT: [[TMP16:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 5 -; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 6 -; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 7 -; SCALABLE-NEXT: [[TMP19:%.*]] = add [[TMP11]], splat (i64 1) -; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 2) -; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 3) -; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 4) -; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 5) -; SCALABLE-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 6) -; SCALABLE-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 7) -; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 8) -; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave8.nxv8i64( [[TMP19]], [[TMP20]], [[TMP21]], [[TMP22]], [[TMP23]], [[TMP24]], [[TMP25]], [[TMP26]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC12]], ptr [[TMP4]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] -; SCALABLE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv1i32() +; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult [[TMP4]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 3 +; SCALABLE-NEXT: [[TMP24:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave8.nxv8i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i64.p0(ptr [[TMP24]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , , } @llvm.vector.deinterleave8.nxv8i64( [[WIDE_MASKED_VEC]]) +; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 2 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 3 +; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 4 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 5 +; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 6 +; SCALABLE-NEXT: [[TMP15:%.*]] = 
extractvalue { , , , , , , , } [[STRIDED_VEC]], 7 +; SCALABLE-NEXT: [[TMP16:%.*]] = add [[TMP8]], splat (i64 1) +; SCALABLE-NEXT: [[TMP17:%.*]] = add [[TMP9]], splat (i64 2) +; SCALABLE-NEXT: [[TMP18:%.*]] = add [[TMP10]], splat (i64 3) +; SCALABLE-NEXT: [[TMP19:%.*]] = add [[TMP11]], splat (i64 4) +; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 5) +; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 6) +; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 7) +; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 8) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave8.nxv8i64( [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]], [[TMP21]], [[TMP22]], [[TMP23]]) +; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave8.nxv8i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; SCALABLE-NEXT: call void @llvm.masked.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP24]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: [[TMP34:%.*]] = zext i32 [[TMP7]] to i64 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP34]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP34]] +; SCALABLE-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SCALABLE-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: br label [[EXIT:%.*]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 ; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -2103,7 +2255,7 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 ; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -2170,40 +2322,47 @@ exit: define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-LABEL: @combine_load_factor2_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, 
[[TMP14]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <vscale x 4 x i32> [[TMP6]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP13]] +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> [[TMP12]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP11]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP10]], ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 @@ 
-2215,7 +2374,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -2226,24 +2385,15 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[INDEX]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP2]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4 -; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> -; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[INDEX]], 1 +; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] ; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4 ; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> -; FIXED-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]] ; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP7]], i32 8 -; FIXED-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP7]], align 4 -; FIXED-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP7]], align 4 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; FIXED: middle.block: @@ -2270,40 +2420,47 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; ; SCALABLE-LABEL: @combine_load_factor2_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP14:%.*]] = sub i64 [[TMP3]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP14]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 
[[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 1 -; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP7]], align 4 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() +; SCALABLE-NEXT: [[TMP12:%.*]] = icmp ult <vscale x 4 x i32> [[TMP6]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1 +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP13]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> [[TMP12]]) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison) +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1 ; SCALABLE-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[TMP9]] ; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[INDEX]] -; SCALABLE-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP11]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; SCALABLE-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP10]], ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]]) +; SCALABLE-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: br label [[EXIT:%.*]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] ; SCALABLE-NEXT: [[X0:%.*]] = load i32, 
ptr [[Q0]], align 4 @@ -2315,7 +2472,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: store i32 [[RES]], ptr [[DST]], align 4 ; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -2347,40 +2504,47 @@ exit: define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-LABEL: @combine_load_factor2_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP14]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]]) +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP13]] +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> [[TMP12]], <vscale x 2 x i1> [[TMP12]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr [[TMP15]], i32 8, <vscale x 4 x i1> [[INTERLEAVED_MASK]], <vscale x 4 x i64> poison) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 2 x i64> [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store <vscale x 2 x i64> [[TMP10]], ptr [[TMP11]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 
[[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -2392,7 +2556,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -2403,24 +2567,15 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[INDEX]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP2]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 8 -; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> -; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[INDEX]], 1 +; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] ; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> -; FIXED-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]] ; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP7]], i32 4 -; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP7]], align 8 -; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add 
nuw i64 [[INDEX]], 8 +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP7]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; FIXED: middle.block: @@ -2447,40 +2602,47 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; ; SCALABLE-LABEL: @combine_load_factor2_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP14:%.*]] = sub i64 [[TMP3]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP14]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 1 -; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP7]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]]) +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP7]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32() +; SCALABLE-NEXT: [[TMP12:%.*]] = icmp ult <vscale x 2 x i32> [[TMP6]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1 +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP13]] +; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> [[TMP12]], <vscale x 2 x i1> [[TMP12]]) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr [[TMP15]], i32 8, <vscale x 4 x i1> [[INTERLEAVED_MASK]], <vscale x 4 x i64> poison) +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1 ; SCALABLE-NEXT: [[TMP10:%.*]] = add <vscale x 2 x i64> [[TMP8]], [[TMP9]] ; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[INDEX]] -; SCALABLE-NEXT: store <vscale x 2 x i64> [[TMP10]], ptr [[TMP11]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]]) +; SCALABLE-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: br label [[EXIT:%.*]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -2492,7 +2654,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: store i64 [[RES]], ptr [[DST]], align 8 ; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll index 056dc7eea2a09..ee91f75fc9706 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll @@ -7,22 +7,31 @@ ; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf6-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=OPT-NF6 ; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf7-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=OPT-NF7 ; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf8-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=OPT-NF8 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,-optimized-nf2-segment-load-store -scalable-vectorization=off -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=FIXED-NO-OPT +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -scalable-vectorization=off -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=FIXED-OPT-NF2 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf3-segment-load-store -scalable-vectorization=off -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=FIXED-OPT-NF3 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf4-segment-load-store -scalable-vectorization=off -debug-only=loop-vectorize 
-disable-output < %s 2>&1 | FileCheck %s --check-prefix=FIXED-OPT-NF4 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf5-segment-load-store -scalable-vectorization=off -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=FIXED-OPT-NF5 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf6-segment-load-store -scalable-vectorization=off -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=FIXED-OPT-NF6 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf7-segment-load-store -scalable-vectorization=off -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=FIXED-OPT-NF7 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf8-segment-load-store -scalable-vectorization=off -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=FIXED-OPT-NF8 %i8.2 = type {i8, i8} define void @i8_factor_2(ptr %data, i64 %n) { entry: br label %for.body +; FIXED-OPT-NF2-LABEL: Checking a loop in 'i8_factor_2' +; FIXED-OPT-NF2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; FIXED-OPT-NF2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; FIXED-OPT-NF2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; FIXED-OPT-NF2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; FIXED-OPT-NF2: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; FIXED-OPT-NF2: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; FIXED-OPT-NF2: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; FIXED-OPT-NF2: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; FIXED-OPT-NF2: Cost of 8 for VF 32: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; FIXED-OPT-NF2: Cost of 8 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%p0> ; OPT-NF2-LABEL: Checking a loop in 'i8_factor_2' -; OPT-NF2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; OPT-NF2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; OPT-NF2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; OPT-NF2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; OPT-NF2: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; OPT-NF2: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; OPT-NF2: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; OPT-NF2: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; OPT-NF2: Cost of 8 for VF 32: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; OPT-NF2: Cost of 8 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%p0> ; OPT-NF2: Cost of 3 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> ; OPT-NF2: Cost of 3 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at , ir<%p0> ; OPT-NF2: Cost of 3 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> @@ -33,17 +42,18 @@ entry: ; OPT-NF2: Cost of 4 for VF vscale x 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> ; OPT-NF2: Cost of 8 for VF vscale x 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> ; OPT-NF2: Cost of 8 for VF vscale x 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; FIXED-NO-OPT-LABEL: Checking a loop in 'i8_factor_2' +; FIXED-NO-OPT: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; FIXED-NO-OPT: 
Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; FIXED-NO-OPT: Cost of 16 for VF 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 16 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; FIXED-NO-OPT: Cost of 32 for VF 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 32 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; FIXED-NO-OPT: Cost of 64 for VF 32: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 64 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%p0> ; NO-OPT-LABEL: Checking a loop in 'i8_factor_2' -; NO-OPT: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; NO-OPT: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; NO-OPT: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; NO-OPT: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; NO-OPT: Cost of 16 for VF 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; NO-OPT: Cost of 16 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; NO-OPT: Cost of 32 for VF 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; NO-OPT: Cost of 32 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; NO-OPT: Cost of 64 for VF 32: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; NO-OPT: Cost of 64 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%p0> ; NO-OPT: Cost of 4 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> ; NO-OPT: Cost of 4 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at , ir<%p0> ; NO-OPT: Cost of 8 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> @@ -76,17 +86,18 @@ for.end: define void @i8_factor_3(ptr %data, i64 %n) { entry: br label %for.body +; FIXED-OPT-NF3-LABEL: Checking a loop in 'i8_factor_3' +; FIXED-OPT-NF3: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; FIXED-OPT-NF3: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; FIXED-OPT-NF3: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; FIXED-OPT-NF3: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; FIXED-OPT-NF3: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; FIXED-OPT-NF3: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; FIXED-OPT-NF3: Cost of 7 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; FIXED-OPT-NF3: Cost of 7 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; FIXED-OPT-NF3: Cost of 14 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; FIXED-OPT-NF3: Cost of 14 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%p0> ; OPT-NF3-LABEL: Checking a loop in 'i8_factor_3' -; OPT-NF3: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; OPT-NF3: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; OPT-NF3: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; OPT-NF3: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; OPT-NF3: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; OPT-NF3: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; OPT-NF3: Cost of 7 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; OPT-NF3: Cost of 7 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; OPT-NF3: Cost of 14 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; OPT-NF3: Cost of 14 for VF 32: INTERLEAVE-GROUP with factor 3 at , 
ir<%p0> ; OPT-NF3: Cost of 4 for VF vscale x 1: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> ; OPT-NF3: Cost of 4 for VF vscale x 1: INTERLEAVE-GROUP with factor 3 at , ir<%p0> ; OPT-NF3: Cost of 4 for VF vscale x 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> @@ -97,17 +108,18 @@ entry: ; OPT-NF3: Cost of 7 for VF vscale x 8: INTERLEAVE-GROUP with factor 3 at , ir<%p0> ; OPT-NF3: Cost of 14 for VF vscale x 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> ; OPT-NF3: Cost of 14 for VF vscale x 16: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; FIXED-NO-OPT-LABEL: Checking a loop in 'i8_factor_3' +; FIXED-NO-OPT: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; FIXED-NO-OPT: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; FIXED-NO-OPT: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; FIXED-NO-OPT: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; FIXED-NO-OPT: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%p0> ; NO-OPT-LABEL: Checking a loop in 'i8_factor_3' -; NO-OPT: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; NO-OPT: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; NO-OPT: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; NO-OPT: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; NO-OPT: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; NO-OPT: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; NO-OPT: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; NO-OPT: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; NO-OPT: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; NO-OPT: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%p0> ; NO-OPT: Cost of 6 for VF vscale x 1: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> ; NO-OPT: Cost of 6 for VF vscale x 1: INTERLEAVE-GROUP with factor 3 at , ir<%p0> ; NO-OPT: Cost of 12 for VF vscale x 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> @@ -144,17 +156,18 @@ for.end: define void @i8_factor_4(ptr %data, i64 %n) { entry: br label %for.body +; FIXED-OPT-NF4-LABEL: Checking a loop in 'i8_factor_4' +; FIXED-OPT-NF4: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; FIXED-OPT-NF4: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; FIXED-OPT-NF4: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; FIXED-OPT-NF4: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; FIXED-OPT-NF4: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; FIXED-OPT-NF4: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; FIXED-OPT-NF4: Cost of 8 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; FIXED-OPT-NF4: Cost of 8 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; FIXED-OPT-NF4: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; FIXED-OPT-NF4: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%p0> ; OPT-NF4-LABEL: Checking 
a loop in 'i8_factor_4' -; OPT-NF4: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; OPT-NF4: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; OPT-NF4: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; OPT-NF4: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; OPT-NF4: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; OPT-NF4: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; OPT-NF4: Cost of 8 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; OPT-NF4: Cost of 8 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; OPT-NF4: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; OPT-NF4: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%p0> ; OPT-NF4: Cost of 5 for VF vscale x 1: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> ; OPT-NF4: Cost of 5 for VF vscale x 1: INTERLEAVE-GROUP with factor 4 at , ir<%p0> ; OPT-NF4: Cost of 5 for VF vscale x 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> @@ -165,17 +178,18 @@ entry: ; OPT-NF4: Cost of 8 for VF vscale x 8: INTERLEAVE-GROUP with factor 4 at , ir<%p0> ; OPT-NF4: Cost of 16 for VF vscale x 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> ; OPT-NF4: Cost of 16 for VF vscale x 16: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; FIXED-NO-OPT-LABEL: Checking a loop in 'i8_factor_4' +; FIXED-NO-OPT: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; FIXED-NO-OPT: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; FIXED-NO-OPT: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; FIXED-NO-OPT: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; FIXED-NO-OPT: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%p0> ; NO-OPT-LABEL: Checking a loop in 'i8_factor_4' -; NO-OPT: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; NO-OPT: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; NO-OPT: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; NO-OPT: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; NO-OPT: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; NO-OPT: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; NO-OPT: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; NO-OPT: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; NO-OPT: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; NO-OPT: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%p0> ; NO-OPT: Cost of 8 for VF vscale x 1: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> ; NO-OPT: Cost of 8 for VF vscale x 1: INTERLEAVE-GROUP with factor 4 at , ir<%p0> ; NO-OPT: Cost of 16 for VF vscale x 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> @@ -216,15 +230,16 @@ for.end: define void @i8_factor_5(ptr %data, i64 %n) { entry: br label %for.body +; FIXED-OPT-NF5-LABEL: Checking a loop in 'i8_factor_5' +; FIXED-OPT-NF5: Cost of 6 for VF 2: INTERLEAVE-GROUP with 
factor 5 at %l0, ir<%p0> +; FIXED-OPT-NF5: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; FIXED-OPT-NF5: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; FIXED-OPT-NF5: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; FIXED-OPT-NF5: Cost of 9 for VF 8: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; FIXED-OPT-NF5: Cost of 9 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; FIXED-OPT-NF5: Cost of 13 for VF 16: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; FIXED-OPT-NF5: Cost of 13 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%p0> ; OPT-NF5-LABEL: Checking a loop in 'i8_factor_5' -; OPT-NF5: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; OPT-NF5: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; OPT-NF5: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; OPT-NF5: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; OPT-NF5: Cost of 9 for VF 8: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; OPT-NF5: Cost of 9 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; OPT-NF5: Cost of 13 for VF 16: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; OPT-NF5: Cost of 13 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%p0> ; OPT-NF5: Cost of 6 for VF vscale x 1: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> ; OPT-NF5: Cost of 6 for VF vscale x 1: INTERLEAVE-GROUP with factor 5 at , ir<%p0> ; OPT-NF5: Cost of 7 for VF vscale x 2: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> @@ -233,15 +248,16 @@ entry: ; OPT-NF5: Cost of 9 for VF vscale x 4: INTERLEAVE-GROUP with factor 5 at , ir<%p0> ; OPT-NF5: Cost of 13 for VF vscale x 8: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> ; OPT-NF5: Cost of 13 for VF vscale x 8: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; FIXED-NO-OPT-LABEL: Checking a loop in 'i8_factor_5' +; FIXED-NO-OPT: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; FIXED-NO-OPT: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; FIXED-NO-OPT: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; FIXED-NO-OPT: Cost of 80 for VF 16: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 80 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%p0> ; NO-OPT-LABEL: Checking a loop in 'i8_factor_5' -; NO-OPT: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; NO-OPT: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; NO-OPT: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; NO-OPT: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; NO-OPT: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; NO-OPT: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; NO-OPT: Cost of 80 for VF 16: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; NO-OPT: Cost of 80 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%p0> ; NO-OPT: Cost of 10 for VF vscale x 1: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> ; NO-OPT: Cost of 10 for VF vscale x 1: INTERLEAVE-GROUP with factor 5 at , ir<%p0> ; NO-OPT: Cost of 20 for VF vscale x 2: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> @@ -284,15 +300,16 @@ for.end: define void @i8_factor_6(ptr %data, 
i64 %n) { entry: br label %for.body +; FIXED-OPT-NF6-LABEL: Checking a loop in 'i8_factor_6' +; FIXED-OPT-NF6: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; FIXED-OPT-NF6: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; FIXED-OPT-NF6: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; FIXED-OPT-NF6: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; FIXED-OPT-NF6: Cost of 10 for VF 8: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; FIXED-OPT-NF6: Cost of 10 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; FIXED-OPT-NF6: Cost of 14 for VF 16: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; FIXED-OPT-NF6: Cost of 14 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%p0> ; OPT-NF6-LABEL: Checking a loop in 'i8_factor_6' -; OPT-NF6: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; OPT-NF6: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; OPT-NF6: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; OPT-NF6: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; OPT-NF6: Cost of 10 for VF 8: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; OPT-NF6: Cost of 10 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; OPT-NF6: Cost of 14 for VF 16: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; OPT-NF6: Cost of 14 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%p0> ; OPT-NF6: Cost of 7 for VF vscale x 1: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> ; OPT-NF6: Cost of 7 for VF vscale x 1: INTERLEAVE-GROUP with factor 6 at , ir<%p0> ; OPT-NF6: Cost of 8 for VF vscale x 2: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> @@ -301,15 +318,16 @@ entry: ; OPT-NF6: Cost of 10 for VF vscale x 4: INTERLEAVE-GROUP with factor 6 at , ir<%p0> ; OPT-NF6: Cost of 14 for VF vscale x 8: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> ; OPT-NF6: Cost of 14 for VF vscale x 8: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; FIXED-NO-OPT-LABEL: Checking a loop in 'i8_factor_6' +; FIXED-NO-OPT: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; FIXED-NO-OPT: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; FIXED-NO-OPT: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; FIXED-NO-OPT: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%p0> ; NO-OPT-LABEL: Checking a loop in 'i8_factor_6' -; NO-OPT: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; NO-OPT: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; NO-OPT: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; NO-OPT: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; NO-OPT: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; NO-OPT: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; NO-OPT: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; NO-OPT: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%p0> ; NO-OPT: Cost of 12 for VF vscale x 1: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> ; NO-OPT: Cost of 12 for VF vscale x 1: INTERLEAVE-GROUP with factor 6 at , 
ir<%p0> ; NO-OPT: Cost of 24 for VF vscale x 2: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> @@ -356,15 +374,16 @@ for.end: define void @i8_factor_7(ptr %data, i64 %n) { entry: br label %for.body +; FIXED-OPT-NF7-LABEL: Checking a loop in 'i8_factor_7' +; FIXED-OPT-NF7: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; FIXED-OPT-NF7: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; FIXED-OPT-NF7: Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; FIXED-OPT-NF7: Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; FIXED-OPT-NF7: Cost of 11 for VF 8: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; FIXED-OPT-NF7: Cost of 11 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; FIXED-OPT-NF7: Cost of 15 for VF 16: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; FIXED-OPT-NF7: Cost of 15 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%p0> ; OPT-NF7-LABEL: Checking a loop in 'i8_factor_7' -; OPT-NF7: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; OPT-NF7: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; OPT-NF7: Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; OPT-NF7: Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; OPT-NF7: Cost of 11 for VF 8: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; OPT-NF7: Cost of 11 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; OPT-NF7: Cost of 15 for VF 16: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; OPT-NF7: Cost of 15 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%p0> ; OPT-NF7: Cost of 8 for VF vscale x 1: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> ; OPT-NF7: Cost of 8 for VF vscale x 1: INTERLEAVE-GROUP with factor 7 at , ir<%p0> ; OPT-NF7: Cost of 9 for VF vscale x 2: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> @@ -373,15 +392,16 @@ entry: ; OPT-NF7: Cost of 11 for VF vscale x 4: INTERLEAVE-GROUP with factor 7 at , ir<%p0> ; OPT-NF7: Cost of 15 for VF vscale x 8: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> ; OPT-NF7: Cost of 15 for VF vscale x 8: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; FIXED-NO-OPT-LABEL: Checking a loop in 'i8_factor_7' +; FIXED-NO-OPT: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; FIXED-NO-OPT: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; FIXED-NO-OPT: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; FIXED-NO-OPT: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%p0> ; NO-OPT-LABEL: Checking a loop in 'i8_factor_7' -; NO-OPT: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; NO-OPT: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; NO-OPT: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; NO-OPT: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; NO-OPT: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; NO-OPT: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; NO-OPT: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; NO-OPT: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%p0> ; 
NO-OPT: Cost of 14 for VF vscale x 1: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> ; NO-OPT: Cost of 14 for VF vscale x 1: INTERLEAVE-GROUP with factor 7 at , ir<%p0> ; NO-OPT: Cost of 28 for VF vscale x 2: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> @@ -432,15 +452,16 @@ for.end: define void @i8_factor_8(ptr %data, i64 %n) { entry: br label %for.body +; FIXED-OPT-NF8-LABEL: Checking a loop in 'i8_factor_8' +; FIXED-OPT-NF8: Cost of 9 for VF 2: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; FIXED-OPT-NF8: Cost of 9 for VF 2: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; FIXED-OPT-NF8: Cost of 10 for VF 4: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; FIXED-OPT-NF8: Cost of 10 for VF 4: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; FIXED-OPT-NF8: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; FIXED-OPT-NF8: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; FIXED-OPT-NF8: Cost of 16 for VF 16: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; FIXED-OPT-NF8: Cost of 16 for VF 16: INTERLEAVE-GROUP with factor 8 at , ir<%p0> ; OPT-NF8-LABEL: Checking a loop in 'i8_factor_8' -; OPT-NF8: Cost of 9 for VF 2: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; OPT-NF8: Cost of 9 for VF 2: INTERLEAVE-GROUP with factor 8 at , ir<%p0> -; OPT-NF8: Cost of 10 for VF 4: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; OPT-NF8: Cost of 10 for VF 4: INTERLEAVE-GROUP with factor 8 at , ir<%p0> -; OPT-NF8: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; OPT-NF8: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 8 at , ir<%p0> -; OPT-NF8: Cost of 16 for VF 16: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; OPT-NF8: Cost of 16 for VF 16: INTERLEAVE-GROUP with factor 8 at , ir<%p0> ; OPT-NF8: Cost of 9 for VF vscale x 1: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> ; OPT-NF8: Cost of 9 for VF vscale x 1: INTERLEAVE-GROUP with factor 8 at , ir<%p0> ; OPT-NF8: Cost of 10 for VF vscale x 2: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> @@ -449,15 +470,16 @@ entry: ; OPT-NF8: Cost of 12 for VF vscale x 4: INTERLEAVE-GROUP with factor 8 at , ir<%p0> ; OPT-NF8: Cost of 16 for VF vscale x 8: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> ; OPT-NF8: Cost of 16 for VF vscale x 8: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; FIXED-NO-OPT-LABEL: Checking a loop in 'i8_factor_8' +; FIXED-NO-OPT: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; FIXED-NO-OPT: Cost of 32 for VF 4: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 32 for VF 4: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; FIXED-NO-OPT: Cost of 64 for VF 8: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 64 for VF 8: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; FIXED-NO-OPT: Cost of 128 for VF 16: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; FIXED-NO-OPT: Cost of 128 for VF 16: INTERLEAVE-GROUP with factor 8 at , ir<%p0> ; NO-OPT-LABEL: Checking a loop in 'i8_factor_8' -; NO-OPT: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; NO-OPT: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 8 at , ir<%p0> -; NO-OPT: Cost of 32 for VF 4: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; NO-OPT: Cost of 32 for VF 4: INTERLEAVE-GROUP with factor 8 at , ir<%p0> -; NO-OPT: Cost of 64 for VF 8: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; NO-OPT: Cost of 64 for VF 8: INTERLEAVE-GROUP with factor 8 at , 
ir<%p0> -; NO-OPT: Cost of 128 for VF 16: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; NO-OPT: Cost of 128 for VF 16: INTERLEAVE-GROUP with factor 8 at , ir<%p0> ; NO-OPT: Cost of 16 for VF vscale x 1: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> ; NO-OPT: Cost of 16 for VF vscale x 1: INTERLEAVE-GROUP with factor 8 at , ir<%p0> ; NO-OPT: Cost of 32 for VF vscale x 2: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll index 93e0f9038361d..946cb95aed2ec 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll @@ -46,124 +46,130 @@ define void @load_store(ptr %p) { ; ; LMUL2-LABEL: @load_store( ; LMUL2-NEXT: entry: -; LMUL2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; LMUL2-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; LMUL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; LMUL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; LMUL2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; LMUL2: vector.ph: ; LMUL2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; LMUL2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; LMUL2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; LMUL2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; LMUL2-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; LMUL2-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; LMUL2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; LMUL2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; LMUL2-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; LMUL2-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2 ; LMUL2-NEXT: br label [[VECTOR_BODY:%.*]] ; LMUL2: vector.body: ; LMUL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; LMUL2-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; LMUL2-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[INDEX]] -; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 8 +; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP6]]) ; LMUL2-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], splat (i64 1) -; LMUL2-NEXT: store [[TMP7]], ptr [[TMP5]], align 8 -; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; LMUL2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; LMUL2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; LMUL2-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP7]], ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP6]]) +; LMUL2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP6]] to i64 +; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] +; LMUL2-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] +; LMUL2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; LMUL2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; LMUL2: middle.block: -; LMUL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; LMUL2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; LMUL2-NEXT: br label [[FOR_END:%.*]] ; LMUL2: scalar.ph: -; LMUL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; LMUL2-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; LMUL2-NEXT: br label [[FOR_BODY:%.*]] ; LMUL2: for.body: -; LMUL2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; LMUL2-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; LMUL2-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]] ; LMUL2-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8 ; LMUL2-NEXT: [[W:%.*]] = add i64 [[V]], 1 ; LMUL2-NEXT: store i64 [[W]], ptr [[Q]], align 8 ; LMUL2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; LMUL2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; LMUL2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; LMUL2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; LMUL2: for.end: ; LMUL2-NEXT: ret void ; ; LMUL4-LABEL: @load_store( ; LMUL4-NEXT: entry: -; LMUL4-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; LMUL4-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; LMUL4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; LMUL4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; LMUL4-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; LMUL4: vector.ph: ; LMUL4-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; LMUL4-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; LMUL4-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; LMUL4-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; LMUL4-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; LMUL4-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; LMUL4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; LMUL4-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; LMUL4-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; LMUL4-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 ; LMUL4-NEXT: br label [[VECTOR_BODY:%.*]] ; LMUL4: vector.body: ; LMUL4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; LMUL4-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; LMUL4-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; LMUL4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[INDEX]] -; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 8 +; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP6]]) ; LMUL4-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], splat (i64 1) -; LMUL4-NEXT: store [[TMP7]], ptr [[TMP5]], align 8 -; LMUL4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; LMUL4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; LMUL4-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; LMUL4-NEXT: call void @llvm.vp.store.nxv4i64.p0( [[TMP7]], ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP6]]) +; LMUL4-NEXT: [[TMP10:%.*]] = zext i32 [[TMP6]] to i64 +; LMUL4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] +; LMUL4-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] +; LMUL4-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; LMUL4-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; LMUL4: middle.block: -; LMUL4-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; LMUL4-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; LMUL4-NEXT: 
br label [[FOR_END:%.*]] ; LMUL4: scalar.ph: -; LMUL4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; LMUL4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; LMUL4-NEXT: br label [[FOR_BODY:%.*]] ; LMUL4: for.body: -; LMUL4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; LMUL4-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; LMUL4-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]] ; LMUL4-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8 ; LMUL4-NEXT: [[W:%.*]] = add i64 [[V]], 1 ; LMUL4-NEXT: store i64 [[W]], ptr [[Q]], align 8 ; LMUL4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; LMUL4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; LMUL4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; LMUL4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; LMUL4: for.end: ; LMUL4-NEXT: ret void ; ; LMUL8-LABEL: @load_store( ; LMUL8-NEXT: entry: -; LMUL8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; LMUL8-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; LMUL8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; LMUL8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; LMUL8-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; LMUL8: vector.ph: ; LMUL8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; LMUL8-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; LMUL8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; LMUL8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; LMUL8-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; LMUL8-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; LMUL8-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; LMUL8-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; LMUL8-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; LMUL8-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8 ; LMUL8-NEXT: br label [[VECTOR_BODY:%.*]] ; LMUL8: vector.body: ; LMUL8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; LMUL8-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; LMUL8-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[INDEX]] -; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 8 +; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP6]]) ; LMUL8-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], splat (i64 1) -; LMUL8-NEXT: store [[TMP7]], ptr [[TMP5]], align 8 -; LMUL8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; LMUL8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; LMUL8-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; LMUL8-NEXT: call void @llvm.vp.store.nxv8i64.p0( [[TMP7]], ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP6]]) +; LMUL8-NEXT: [[TMP10:%.*]] = zext i32 [[TMP6]] to i64 +; LMUL8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] +; LMUL8-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] +; LMUL8-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; LMUL8-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; LMUL8: 
middle.block: -; LMUL8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; LMUL8-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; LMUL8-NEXT: br label [[FOR_END:%.*]] ; LMUL8: scalar.ph: -; LMUL8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; LMUL8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; LMUL8-NEXT: br label [[FOR_BODY:%.*]] ; LMUL8: for.body: -; LMUL8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; LMUL8-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; LMUL8-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]] ; LMUL8-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8 ; LMUL8-NEXT: [[W:%.*]] = add i64 [[V]], 1 ; LMUL8-NEXT: store i64 [[W]], ptr [[Q]], align 8 ; LMUL8-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; LMUL8-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; LMUL8-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; LMUL8-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; LMUL8: for.end: ; LMUL8-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 32cb4261622bf..aad789c78bb0a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -185,13 +185,22 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP9]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 16, [[TMP2]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP9]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP3]], 8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1) -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4:%.*]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i8> [[TMP2]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 16, i32 8, i1 true) +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP1:%.*]], splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = shl [[VP_OP_LOAD]], splat (i8 1) +; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP4:%.*]], splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP6]], [[VP_OP_LOAD1]] +; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0( [[TMP11]], ptr align 1 [[TMP4]], splat (i1 true), i32 [[TMP5]]) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] @@ -199,7 +208,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; 
CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[I_08]] ; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1 @@ -239,13 +248,22 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP9]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 32, [[TMP2]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP9]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP3]], 16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1) -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP4:%.*]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <32 x i8> [[TMP2]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 32, i32 16, i1 true) +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP1:%.*]], splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = shl [[VP_OP_LOAD]], splat (i8 1) +; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP4:%.*]], splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP6]], [[VP_OP_LOAD1]] +; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP11]], ptr align 1 [[TMP4]], splat (i1 true), i32 [[TMP5]]) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] @@ -253,7 +271,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[I_08]] ; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1 @@ -292,26 +310,30 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 24, [[TMP2]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16 ; CHECK-NEXT: br label 
[[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP4]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 24, i32 16, i1 true) +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[SRC:%.*]], splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = shl [[VP_OP_LOAD]], splat (i8 1) +; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[DST:%.*]], splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], [[VP_OP_LOAD1]] +; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP7]], ptr align 1 [[DST]], splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] ; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP8]], 1 @@ -321,7 +343,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -360,35 +382,31 @@ define i8 @mul_non_pow_2_low_trip_count(ptr noalias %a) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ , [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ule <16 x i64> [[VEC_IV]], splat (i64 9) +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i8> [ , [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr 
[[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP2]] = mul <16 x i8> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP2]], <16 x i8> [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1 +; CHECK-NEXT: [[TMP1]] = mul <8 x i8> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> [[TMP3]]) -; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> [[TMP1]]) +; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ 2, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 2, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ 2, [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[GEP]], align 1 ; CHECK-NEXT: [[MUL]] = mul i8 [[TMP5]], [[RDX]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 10 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i8 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i8 [ [[MUL]], [[FOR_BODY]] ] ; CHECK-NEXT: ret i8 [[MUL_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll index 10ba208390c33..e5e694cbe949a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll @@ -11,15 +11,14 @@ target triple = "riscv64" define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) { ; VLENUNK-LABEL: @test( ; VLENUNK-NEXT: entry: -; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VLENUNK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: 
vector.ph: ; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; VLENUNK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; VLENUNK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1 +; VLENUNK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP12]] +; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VLENUNK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; VLENUNK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 @@ -27,32 +26,41 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) { ; VLENUNK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; VLENUNK-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i64 1) ; VLENUNK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] -; VLENUNK-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP5]] -; VLENUNK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 -; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; VLENUNK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; VLENUNK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; VLENUNK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP7]] to i64 +; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP15]] +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; VLENUNK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i32() +; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult [[TMP10]], [[BROADCAST_SPLAT4]] ; VLENUNK-NEXT: [[TMP13:%.*]] = icmp ult [[VEC_IND]], splat (i64 512) +; VLENUNK-NEXT: [[TMP16:%.*]] = select [[TMP11]], [[TMP13]], zeroinitializer ; VLENUNK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] -; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, [[TMP13]], poison) -; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], [[WIDE_MASKED_LOAD]], zeroinitializer +; VLENUNK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], [[TMP13]], i32 [[TMP7]]) +; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], [[VP_OP_LOAD]], zeroinitializer ; VLENUNK-NEXT: [[TMP17:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]] ; VLENUNK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] -; VLENUNK-NEXT: store [[TMP17]], ptr [[TMP18]], align 4 -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; VLENUNK-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP17]], ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP7]]) +; VLENUNK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]] +; VLENUNK-NEXT: 
[[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] ; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; VLENUNK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLENUNK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; VLENUNK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VLENUNK: middle.block: -; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; VLENUNK-NEXT: br label [[FOR_END:%.*]] ; VLENUNK: scalar.ph: -; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; VLENUNK-NEXT: br label [[FOR_BODY:%.*]] ; VLENUNK: for.body: -; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; VLENUNK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[IV]], 512 ; VLENUNK-NEXT: br i1 [[ICMP]], label [[DO_LOAD:%.*]], label [[LATCH]] ; VLENUNK: do_load: @@ -66,7 +74,7 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) { ; VLENUNK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; VLENUNK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VLENUNK: for.end: ; VLENUNK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index 6800a93ce6db9..b28d801071262 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -16,11 +16,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 { ; RV32-LABEL: @foo4( ; RV32-NEXT: entry: -; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]] -; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; RV32-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; RV32: vector.memcheck: ; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880 ; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940 @@ -36,40 +32,45 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32: vector.ph: ; RV32-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; RV32-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]] -; RV32-NEXT: [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]] +; RV32-NEXT: [[TMP2:%.*]] = sub i64 [[TMP4]], 1 +; RV32-NEXT: [[N_RND_UP:%.*]] = add i64 625, [[TMP2]] +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; RV32-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; RV32-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2 -; RV32-NEXT: 
[[IND_END:%.*]] = mul i64 [[N_VEC]], 16 ; RV32-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i64() ; RV32-NEXT: [[TMP9:%.*]] = mul [[TMP7]], splat (i64 16) ; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] -; RV32-NEXT: [[TMP12:%.*]] = mul i64 16, [[TMP6]] -; RV32-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 -; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; RV32-NEXT: br label [[VECTOR_BODY:%.*]] ; RV32: vector.body: ; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV32-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV32-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV32-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; RV32-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64 +; RV32-NEXT: [[TMP11:%.*]] = mul i64 16, [[TMP8]] +; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 +; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP13]], i32 4, splat (i1 true), poison), !alias.scope [[META0:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i32.nxv2p0( align 4 [[TMP13]], splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]] ; RV32-NEXT: [[TMP14:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], splat (i32 100) ; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw [[VEC_IND]], splat (i64 1) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP15]] -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP16]], i32 8, [[TMP14]], poison), !alias.scope [[META3:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.vp.gather.nxv2f64.nxv2p0( align 8 [[TMP16]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]] ; RV32-NEXT: [[TMP17:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to ; RV32-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] -; RV32-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP18]], [[TMP19]], i32 8, [[TMP14]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0( [[TMP18]], align 8 [[TMP19]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; RV32-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP20]], [[INDEX]] +; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] ; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; RV32-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 625 +; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; RV32: middle.block: -; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]] -; RV32-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; RV32-NEXT: br label 
[[FOR_END:%.*]] ; RV32: scalar.ph: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; RV32-NEXT: br label [[FOR_BODY:%.*]] ; RV32: for.body: ; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -89,17 +90,13 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32: for.inc: ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] +; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] ; RV32: for.end: ; RV32-NEXT: ret void ; ; RV64-LABEL: @foo4( ; RV64-NEXT: entry: -; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]] -; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; RV64-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; RV64: vector.memcheck: ; RV64-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 ; RV64-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940 @@ -115,40 +112,45 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64: vector.ph: ; RV64-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; RV64-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]] -; RV64-NEXT: [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]] +; RV64-NEXT: [[TMP2:%.*]] = sub i64 [[TMP4]], 1 +; RV64-NEXT: [[N_RND_UP:%.*]] = add i64 625, [[TMP2]] +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; RV64-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; RV64-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2 -; RV64-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 16 ; RV64-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i64() ; RV64-NEXT: [[TMP9:%.*]] = mul [[TMP7]], splat (i64 16) ; RV64-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] -; RV64-NEXT: [[TMP12:%.*]] = mul i64 16, [[TMP6]] -; RV64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 -; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; RV64-NEXT: br label [[VECTOR_BODY:%.*]] ; RV64: vector.body: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV64-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV64-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV64-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; RV64-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64 +; RV64-NEXT: [[TMP11:%.*]] = mul i64 16, [[TMP8]] +; RV64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 +; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds 
i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP13]], i32 4, splat (i1 true), poison), !alias.scope [[META0:![0-9]+]] +; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i32.nxv2p0( align 4 [[TMP13]], splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]] ; RV64-NEXT: [[TMP14:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], splat (i32 100) ; RV64-NEXT: [[TMP15:%.*]] = shl nuw nsw [[VEC_IND]], splat (i64 1) ; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP15]] -; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP16]], i32 8, [[TMP14]], poison), !alias.scope [[META3:![0-9]+]] +; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.vp.gather.nxv2f64.nxv2p0( align 8 [[TMP16]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]] ; RV64-NEXT: [[TMP17:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to ; RV64-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] -; RV64-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP18]], [[TMP19]], i32 8, [[TMP14]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; RV64-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0( [[TMP18]], align 8 [[TMP19]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; RV64-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP20]], [[INDEX]] +; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] ; RV64-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV64-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 625 +; RV64-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; RV64: middle.block: -; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]] -; RV64-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; RV64-NEXT: br label [[FOR_END:%.*]] ; RV64: scalar.ph: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; RV64-NEXT: br label [[FOR_BODY:%.*]] ; RV64: for.body: ; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -168,7 +170,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64: for.inc: ; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] +; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] ; RV64: for.end: ; RV64-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll index ee6b950f9b911..10af538944d65 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4 -; RUN: opt -passes=loop-vectorize -mattr=+v -S < %s | FileCheck %s --check-prefixes=CHECK,V -; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -S < %s | FileCheck %s --check-prefixes=CHECK,ZVQDOTQ -; RUN: opt -passes=loop-vectorize -mattr=+v -scalable-vectorization=off -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-V -; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -scalable-vectorization=off -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-ZVQDOTQ +; RUN: opt -passes=loop-vectorize -mattr=+v -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=CHECK,V +; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=CHECK,ZVQDOTQ +; RUN: opt -passes=loop-vectorize -mattr=+v -scalable-vectorization=off -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-V +; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -scalable-vectorization=off -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-ZVQDOTQ + +; TODO: Remove -prefer-predicate-over-epilogue=scalar-epilogue when partial reductions with EVL tail folding is supported. target triple = "riscv64-none-unknown-elf" diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll index b5b62d0704c91..cad15d17201c9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll @@ -9,15 +9,14 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64 ; CHECK-LABEL: define void @pr87378_vpinstruction_or_drop_poison_generating_flags( ; CHECK-SAME: ptr [[ARG:%.*]], i64 [[A:%.*]], i64 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1001, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1001, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1001, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1001, [[TMP12]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[A]], i64 0 @@ -29,39 +28,49 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64 ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv8i64() ; 
CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP5]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1001, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[TMP25]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP25]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv8i32() +; CHECK-NEXT: [[TMP11:%.*]] = icmp ult [[TMP10]], [[BROADCAST_SPLAT8]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp ule [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP28:%.*]] = select [[TMP11]], [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IND]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP15:%.*]] = select [[TMP13]], [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = select [[TMP28]], [[TMP14]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = xor [[TMP13]], splat (i1 true) -; CHECK-NEXT: [[TMP17:%.*]] = or [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP29:%.*]] = select [[TMP11]], [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or [[TMP15]], [[TMP29]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp ule [[VEC_IND]], [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[TMP19:%.*]] = select [[TMP17]], [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = xor [[TMP14]], splat (i1 true) -; CHECK-NEXT: [[TMP21:%.*]] = select [[TMP13]], [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = select [[TMP28]], [[TMP20]], zeroinitializer ; CHECK-NEXT: [[TMP22:%.*]] = or [[TMP19]], [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = extractelement [[TMP21]], i32 0 ; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP23]], i64 poison, i64 [[INDEX]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[ARG]], i64 [[PREDPHI]] -; CHECK-NEXT: call void @llvm.masked.store.nxv8i16.p0( zeroinitializer, ptr [[TMP24]], i32 2, [[TMP22]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: call void @llvm.vp.store.nxv8i16.p0( zeroinitializer, ptr align 2 [[TMP24]], [[TMP22]], i32 [[TMP25]]) +; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP25]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP26]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP26]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1001 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1001, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[C_1:%.*]] = icmp ule i64 [[IV]], [[A]] ; CHECK-NEXT: br i1 [[C_1]], label [[THEN_1:%.*]], label [[ELSE_1:%.*]] ; CHECK: then.1: @@ -80,7 +89,7 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64 ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i64 [[IV]], 1000 -; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -118,8 +127,9 @@ exit: ret void } ;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll index 554ce7b81e953..39e865bdfc678 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll @@ -9,46 +9,49 @@ define i32 @add(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) { ; CHECK-LABEL: define i32 @add( ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP12]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 2, i32 0), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 2, i32 0), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7]] = add [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP13]]) +; CHECK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP8]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[TMP7]], [[VEC_PHI]], i32 [[TMP13]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP7]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; 
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP8]])
+; CHECK-NEXT: br label %[[FOR_END:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP10]], [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
 ;
 entry:
@@ -74,46 +77,49 @@ define i32 @or(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: define i32 @or(
 ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP12]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 2, i32 0), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 2, i32 0), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[TMP7]] = or <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP13]])
+; CHECK-NEXT: [[TMP7:%.*]] = or <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP13]])
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP13]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[TMP7]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[TMP8]])
+; CHECK-NEXT: br label %[[FOR_END:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[OR:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[OR:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[OR]] = or i32 [[TMP10]], [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[OR_LCSSA]]
 ;
 entry:
@@ -139,46 +145,49 @@ define i32 @and(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: define i32 @and(
 ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP12]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> splat (i32 -1), i32 2, i32 0), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> splat (i32 -1), i32 2, i32 0), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[TMP7]] = and <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP13]])
+; CHECK-NEXT: [[TMP7:%.*]] = and <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP13]])
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP13]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> [[TMP7]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> [[TMP8]])
+; CHECK-NEXT: br label %[[FOR_END:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[AND:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[AND:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[AND]] = and i32 [[TMP10]], [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[AND_LCSSA]]
 ;
 entry:
@@ -204,46 +213,49 @@ define i32 @xor(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: define i32 @xor(
 ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP12]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 2, i32 0), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 2, i32 0), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[TMP7]] = xor <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP13]])
+; CHECK-NEXT: [[TMP7:%.*]] = xor <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP13]])
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP13]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP7]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP8]])
+; CHECK-NEXT: br label %[[FOR_END:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[XOR]] = xor i32 [[TMP10]], [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[XOR_LCSSA]]
 ;
 entry:
@@ -269,48 +281,51 @@ define i32 @smin(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: define i32 @smin(
 ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ splat (i32 2), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ splat (i32 2), %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP8]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP9]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP14]])
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP8]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP9]])
+; CHECK-NEXT: br label %[[FOR_END:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP11]], [[SUM_010]]
 ; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[SUM_010]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[DOTSROA_SPECULATED_LCSSA]]
 ;
 entry:
@@ -337,48 +352,51 @@ define i32 @umax(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: define i32 @umax(
 ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ splat (i32 2), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ splat (i32 2), %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP8]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP9]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP14]])
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP8]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP9]])
+; CHECK-NEXT: br label %[[FOR_END:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP11]], [[SUM_010]]
 ; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[SUM_010]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[DOTSROA_SPECULATED_LCSSA]]
 ;
 entry:
@@ -405,46 +423,49 @@ define float @fadd_fast(ptr noalias nocapture readonly %a, i64 %n) {
 ; CHECK-LABEL: define float @fadd_fast(
 ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP12]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[TMP7]] = fadd fast <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP13]])
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = call <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[TMP7]], <vscale x 4 x float> [[VEC_PHI]], i32 [[TMP13]])
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP13]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[TMP7]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[TMP8]])
+; CHECK-NEXT: br label %[[FOR_END:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[ADD]] = fadd fast float [[TMP10]], [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret float [[ADD_LCSSA]]
 ;
 entry:
@@ -468,46 +489,49 @@ define half @fadd_fast_half_zvfh(ptr noalias nocapture readonly %a, i64 %n) "tar
 ; CHECK-LABEL: define half @fadd_fast_half_zvfh(
 ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP12]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x half> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x half> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[TMP7]] = fadd fast <vscale x 8 x half> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 8 x half> @llvm.vp.load.nxv8f16.p0(ptr align 4 [[TMP6]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP13]])
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <vscale x 8 x half> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = call <vscale x 8 x half> @llvm.vp.merge.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[TMP7]], <vscale x 8 x half> [[VEC_PHI]], i32 [[TMP13]])
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP13]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP9:%.*]] = call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> [[TMP7]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP11:%.*]] = call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> [[TMP8]])
+; CHECK-NEXT: br label %[[FOR_END:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi half [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0xH0000, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi half [ 0xH0000, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ 0xH0000, %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[ADD]] = fadd fast half [[TMP10]], [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi half [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi half [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret half [[ADD_LCSSA]]
 ;
 entry:
@@ -549,7 +573,7 @@ define half @fadd_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) "
 ; CHECK-NEXT: [[TMP3]] = fadd fast <16 x half> [[WIDE_LOAD2]], [[VEC_PHI1]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x half> [[TMP3]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> [[BIN_RDX]])
@@ -567,7 +591,7 @@ define half @fadd_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) "
 ; CHECK-NEXT: [[ADD]] = fadd fast half [[TMP6]], [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: [[FOR_END]]:
 ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi half [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret half [[ADD_LCSSA]]
@@ -611,7 +635,7 @@ define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) "targ
 ; CHECK-NEXT: [[TMP3]] = fadd fast <16 x bfloat> [[WIDE_LOAD2]], [[VEC_PHI1]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x bfloat> [[TMP3]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> [[BIN_RDX]])
@@ -629,7 +653,7 @@ define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) "targ
 ; CHECK-NEXT: [[ADD]] = fadd fast bfloat [[TMP6]], [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: [[FOR_END]]:
 ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi bfloat [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret bfloat [[ADD_LCSSA]]
@@ -657,48 +681,51 @@ define float @fmin_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-LABEL: define float @fmin_fast(
 ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR4:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
 ; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP8]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP9]] = call <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[TMP8]], <vscale x 4 x float> [[VEC_PHI]], i32 [[TMP14]])
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP8]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP9]])
+; CHECK-NEXT: br label %[[FOR_END:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt float [[TMP11]], [[SUM_07]]
 ; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP11]], float [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret float [[DOTSROA_SPECULATED_LCSSA]]
 ;
 entry:
@@ -723,48 +750,51 @@ define half @fmin_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #
 ; CHECK-LABEL: define half @fmin_fast_half_zvfhmin(
 ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR5:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x half> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x half> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 8 x half> @llvm.vp.load.nxv8f16.p0(ptr align 4 [[TMP6]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP14]])
 ; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt <vscale x 8 x half> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP8]] = select <vscale x 8 x i1> [[TMP7]], <vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <vscale x 8 x i1> [[TMP7]], <vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP9]] = call <vscale x 8 x half> @llvm.vp.merge.nxv8f16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x half> [[TMP8]], <vscale x 8 x half> [[VEC_PHI]], i32 [[TMP14]])
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP10:%.*]] = call half @llvm.vector.reduce.fmin.nxv8f16(<vscale x 8 x half> [[TMP8]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP12:%.*]] = call half @llvm.vector.reduce.fmin.nxv8f16(<vscale x 8 x half> [[TMP9]])
+; CHECK-NEXT: br label %[[FOR_END:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi half [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0xH0000, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi half [ 0xH0000, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ 0xH0000, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt half [[TMP11]], [[SUM_07]]
 ; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], half [[TMP11]], half [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi half [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi half [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret half [[DOTSROA_SPECULATED_LCSSA]]
 ;
 entry:
@@ -789,48 +819,51 @@ define bfloat @fmin_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64
 ; CHECK-LABEL: define bfloat @fmin_fast_bfloat_zvfbfmin(
 ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR6:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x bfloat> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x bfloat> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds bfloat, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 8 x bfloat> @llvm.vp.load.nxv8bf16.p0(ptr align 4 [[TMP6]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP14]])
 ; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt <vscale x 8 x bfloat> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP8]] = select <vscale x 8 x i1> [[TMP7]], <vscale x 8 x bfloat> [[WIDE_LOAD]], <vscale x 8 x bfloat> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <vscale x 8 x i1> [[TMP7]], <vscale x 8 x bfloat> [[WIDE_LOAD]], <vscale x 8 x bfloat> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP9]] = call <vscale x 8 x bfloat> @llvm.vp.merge.nxv8bf16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[VEC_PHI]], i32 [[TMP14]])
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP10:%.*]] = call bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat> [[TMP8]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP12:%.*]] = call bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat> [[TMP9]])
+; CHECK-NEXT: br label %[[FOR_END:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi bfloat [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0xR0000, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi bfloat [ 0xR0000, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi bfloat [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi bfloat [ 0xR0000, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds bfloat, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP11:%.*]] = load bfloat, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt bfloat [[TMP11]], [[SUM_07]]
 ; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], bfloat [[TMP11]], bfloat [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi bfloat [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi bfloat [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret bfloat [[DOTSROA_SPECULATED_LCSSA]]
 ;
 entry:
@@ -857,48 +890,51 @@ define float @fmax_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-LABEL: define float @fmax_fast(
 ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
 ; CHECK-NEXT: [[TMP7:%.*]] = fcmp fast ogt <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP8]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP9]] = call <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[TMP8]], <vscale x 4 x float> [[VEC_PHI]], i32 [[TMP14]])
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
@llvm.vector.reduce.fmax.nxv4f32( [[TMP8]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32( [[TMP9]]) +; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast ogt float [[TMP11]], [[SUM_07]] ; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP11]], float [[SUM_07]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[DOTSROA_SPECULATED_LCSSA]] ; entry: @@ -923,48 +959,51 @@ define half @fmax_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) # ; CHECK-LABEL: define half @fmax_fast_half_zvfhmin( ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR5]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 
0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP14]]) ; CHECK-NEXT: [[TMP7:%.*]] = fcmp fast ogt [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP8]] = select [[TMP7]], [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP9]] = call @llvm.vp.merge.nxv8f16( splat (i1 true), [[TMP8]], [[VEC_PHI]], i32 [[TMP14]]) +; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP10:%.*]] = call fast half @llvm.vector.reduce.fmax.nxv8f16( [[TMP8]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[TMP12:%.*]] = call fast half @llvm.vector.reduce.fmax.nxv8f16( [[TMP9]]) +; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi half [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0xH0000, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi half [ 0xH0000, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ 0xH0000, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast ogt half [[TMP11]], [[SUM_07]] ; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], half [[TMP11]], half [[SUM_07]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], 
label %[[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi half [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi half [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret half [[DOTSROA_SPECULATED_LCSSA]] ; entry: @@ -989,48 +1028,51 @@ define bfloat @fmax_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64 ; CHECK-LABEL: define bfloat @fmax_fast_bfloat_zvfbfmin( ; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR6]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds bfloat, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv8bf16.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP14]]) ; CHECK-NEXT: [[TMP7:%.*]] = fcmp fast ogt [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP8]] = select [[TMP7]], [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP9]] = call @llvm.vp.merge.nxv8bf16( splat (i1 true), [[TMP8]], [[VEC_PHI]], i32 [[TMP14]]) +; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; 
CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP10:%.*]] = call fast bfloat @llvm.vector.reduce.fmax.nxv8bf16( [[TMP8]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[TMP12:%.*]] = call fast bfloat @llvm.vector.reduce.fmax.nxv8bf16( [[TMP9]]) +; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi bfloat [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0xR0000, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi bfloat [ 0xR0000, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_07:%.*]] = phi bfloat [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi bfloat [ 0xR0000, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds bfloat, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP11:%.*]] = load bfloat, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast ogt bfloat [[TMP11]], [[SUM_07]] ; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], bfloat [[TMP11]], bfloat [[SUM_07]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi bfloat [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi bfloat [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret bfloat [[DOTSROA_SPECULATED_LCSSA]] ; entry: @@ -1077,7 +1119,7 @@ define i32 @mul(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) { ; CHECK-NEXT: [[TMP3]] = mul <8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <8 x i32> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[BIN_RDX]]) @@ -1095,7 +1137,7 @@ define i32 @mul(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) { ; CHECK-NEXT: [[MUL]] = mul nsw i32 [[TMP6]], [[SUM_07]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label 
%[[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], %[[FOR_BODY]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[MUL_LCSSA]] @@ -1142,7 +1184,7 @@ define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture re ; CHECK-NEXT: [[TMP5]] = mul <8 x i32> [[WIDE_LOAD1]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -1165,7 +1207,7 @@ define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture re ; CHECK-NEXT: [[MUL]] = mul nsw i32 [[TMP9]], [[SUM]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[MUL_LCSSA]] @@ -1197,40 +1239,43 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n) { ; CHECK-LABEL: define float @fmuladd( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (float -0.000000e+00), float 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (float -0.000000e+00), float 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP14]]) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8]] = call reassoc @llvm.fmuladd.nxv4f32( [[WIDE_LOAD]], [[WIDE_LOAD1]], [[VEC_PHI]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP7]], splat (i1 true), i32 [[TMP14]]) +; CHECK-NEXT: [[TMP8:%.*]] = call reassoc @llvm.fmuladd.nxv4f32( [[WIDE_LOAD]], [[WIDE_LOAD1]], [[VEC_PHI]]) +; CHECK-NEXT: [[TMP9]] = call @llvm.vp.merge.nxv4f32( splat (i1 true), [[TMP8]], [[VEC_PHI]], i32 [[TMP14]]) +; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP10:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP8]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[TMP16:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP9]]) +; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MULADD:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[MULADD:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] @@ -1238,9 +1283,9 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP11]], float [[TMP12]], float [[SUM_07]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] ; CHECK: [[FOR_END]]: -; 
CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], %[[FOR_BODY]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MULADD_LCSSA]] ; entry: @@ -1266,40 +1311,43 @@ define half @fmuladd_f16_zvfh(ptr %a, ptr %b, i64 %n) "target-features"="+zvfh" ; CHECK-LABEL: define half @fmuladd_f16_zvfh( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (half 0xH8000), half 0xH0000, i32 0), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (half 0xH8000), half 0xH0000, i32 0), %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP14]]) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds half, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8]] = call reassoc @llvm.fmuladd.nxv8f16( [[WIDE_LOAD]], [[WIDE_LOAD1]], [[VEC_PHI]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = call @llvm.vp.load.nxv8f16.p0(ptr align 4 [[TMP7]], splat (i1 true), i32 [[TMP14]]) +; CHECK-NEXT: [[TMP8:%.*]] = call reassoc @llvm.fmuladd.nxv8f16( [[WIDE_LOAD]], [[WIDE_LOAD1]], [[VEC_PHI]]) +; CHECK-NEXT: [[TMP9]] = call @llvm.vp.merge.nxv8f16( splat (i1 true), [[TMP8]], [[VEC_PHI]], i32 [[TMP14]]) +; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP10]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], 
[[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP10:%.*]] = call reassoc half @llvm.vector.reduce.fadd.nxv8f16(half 0xH8000, [[TMP8]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[TMP16:%.*]] = call reassoc half @llvm.vector.reduce.fadd.nxv8f16(half 0xH8000, [[TMP9]]) +; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi half [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0xH0000, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi half [ 0xH0000, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MULADD:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ 0xH0000, %[[SCALAR_PH]] ], [ [[MULADD:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[B]], i64 [[IV]] @@ -1307,9 +1355,9 @@ define half @fmuladd_f16_zvfh(ptr %a, ptr %b, i64 %n) "target-features"="+zvfh" ; CHECK-NEXT: [[MULADD]] = tail call reassoc half @llvm.fmuladd.f16(half [[TMP11]], half [[TMP12]], half [[SUM_07]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi half [ [[MULADD]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi half [ [[MULADD]], %[[FOR_BODY]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret half [[MULADD_LCSSA]] ; entry: @@ -1360,7 +1408,7 @@ define half @fmuladd_f16_zvfhmin(ptr %a, ptr %b, i64 %n) "target-features"="+zvf ; CHECK-NEXT: [[TMP5]] = call reassoc <16 x half> @llvm.fmuladd.v16f16(<16 x half> [[WIDE_LOAD2]], <16 x half> [[WIDE_LOAD4]], <16 x half> [[VEC_PHI1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <16 x half> [[TMP5]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> [[BIN_RDX]]) @@ -1380,7 +1428,7 @@ define half @fmuladd_f16_zvfhmin(ptr %a, ptr %b, i64 %n) "target-features"="+zvf ; CHECK-NEXT: [[MULADD]] = tail call reassoc half @llvm.fmuladd.f16(half 
[[TMP8]], half [[TMP9]], half [[SUM_07]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi half [ [[MULADD]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret half [[MULADD_LCSSA]] @@ -1430,7 +1478,7 @@ define bfloat @fmuladd_bf16(ptr %a, ptr %b, i64 %n) "target-features"="+zvfbfmin ; CHECK-NEXT: [[TMP5]] = call reassoc <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> [[WIDE_LOAD2]], <16 x bfloat> [[WIDE_LOAD4]], <16 x bfloat> [[VEC_PHI1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <16 x bfloat> [[TMP5]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = call reassoc bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR8000, <16 x bfloat> [[BIN_RDX]]) @@ -1450,7 +1498,7 @@ define bfloat @fmuladd_bf16(ptr %a, ptr %b, i64 %n) "target-features"="+zvfbfmin ; CHECK-NEXT: [[MULADD]] = tail call reassoc bfloat @llvm.fmuladd.bf16(bfloat [[TMP8]], bfloat [[TMP9]], bfloat [[SUM_07]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi bfloat [ [[MULADD]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret bfloat [[MULADD_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll index 5a67b54c7a3d5..346f1cbcc7e3d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll @@ -1,5 +1,9 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -prefer-predicate-over-epilogue=scalar-epilogue -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s + +; TODO: -prefer-predicate-over-epilogue=scalar-epilogue was added to allow +; unrolling. Calculate register pressure for all VPlans, not just unrolled ones, +; and remove. 
 define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
index d4909fa61b4f5..b25bc485a9ca7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
@@ -1,6 +1,10 @@
 ; REQUIRES: asserts
-; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfh -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFH
-; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFHMIN
+; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfh -prefer-predicate-over-epilogue=scalar-epilogue -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFH
+; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -prefer-predicate-over-epilogue=scalar-epilogue -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFHMIN
+
+; TODO: -prefer-predicate-over-epilogue=scalar-epilogue was added to allow
+; unrolling. Calculate register pressure for all VPlans, not just unrolled ones,
+; and remove.
 
 define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
index 70372826a8256..116ccc9961795 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
@@ -1,25 +1,29 @@
 ; REQUIRES: asserts
 ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \
 ; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \
-; RUN: -force-vector-width=1 \
+; RUN: -force-vector-width=1 -prefer-predicate-over-epilogue=scalar-epilogue \
 ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-SCALAR
 ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \
 ; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \
-; RUN: -riscv-v-register-bit-width-lmul=1 \
+; RUN: -riscv-v-register-bit-width-lmul=1 -prefer-predicate-over-epilogue=scalar-epilogue \
 ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL1
 ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \
 ; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \
-; RUN: -riscv-v-register-bit-width-lmul=2 \
+; RUN: -riscv-v-register-bit-width-lmul=2 -prefer-predicate-over-epilogue=scalar-epilogue \
 ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL2
 ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \
 ; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \
-; RUN: -riscv-v-register-bit-width-lmul=4 \
+; RUN: -riscv-v-register-bit-width-lmul=4 -prefer-predicate-over-epilogue=scalar-epilogue \
 ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL4
 ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \
 ; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \
-; RUN: -riscv-v-register-bit-width-lmul=8 \
+; RUN: -riscv-v-register-bit-width-lmul=8 -prefer-predicate-over-epilogue=scalar-epilogue \
 ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL8
 
+; TODO: -prefer-predicate-over-epilogue=scalar-epilogue was added to allow
+; unrolling. Calculate register pressure for all VPlans, not just unrolled ones,
+; and remove.
+
 define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
 ; CHECK-SCALAR: LV(REG): VF = 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll
index 85163c79072b5..a0d29e8bc3164 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll
@@ -5,41 +5,42 @@ define float @s311(float %a_0, float %s311_sum) {
 ; CHECK-LABEL: define float @s311(
 ; CHECK-SAME: float [[A_0:%.*]], float [[S311_SUM:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 1200, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1200, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1200, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub i32 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 1200, [[TMP8]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 4
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[A_0]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ [[S311_SUM]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[BROADCAST_SPLAT]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 1200, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[TMP6]] = call float @llvm.vp.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[BROADCAST_SPLAT]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP9]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP9]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1200
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1200, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ [[S311_SUM]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[S311_SUM]], %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi float [ [[S311_SUM]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT: [[RED_NEXT]] = fadd float [[A_0]], [[RED]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1200
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: [[RED_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret float [[RED_LCSSA]]
 ;
 entry:
@@ -60,8 +61,9 @@ exit:
   ret float %red.lcssa
 }
 ;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
 ;.
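
The changed CHECK lines in every file above assert one and the same EVL tail-folded loop skeleton, so a compact reference may help when reading them. Below is a minimal, self-contained sketch of that skeleton as plain IR, assuming a <vscale x 4 x i32> data path; the function and value names are invented for illustration and appear in no test in this patch, but each intrinsic call mirrors the forms asserted above.

; Sketch only: an EVL tail-folded copy-and-increment loop. The AVL phi
; counts the elements still to process; get.vector.length clamps it to
; what one iteration can handle, and that explicit vector length (EVL)
; drives the vp.* intrinsics, which is why the minimum-iteration check
; and the scalar epilogue disappear from the tests above.
define void @evl_sketch(ptr noalias %a, ptr noalias %b, i64 %n) {
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.evl.next, %vector.body ]
  %avl = phi i64 [ %n, %entry ], [ %avl.next, %vector.body ]
  ; EVL = min(AVL, VLMAX for <vscale x 4 x i32>)
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 4, i1 true)
  %src = getelementptr inbounds i32, ptr %b, i64 %index
  %v = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %src, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %inc = add <vscale x 4 x i32> %v, splat (i32 1)
  %dst = getelementptr inbounds i32, ptr %a, i64 %index
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %inc, ptr align 4 %dst, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  ; Advance the IV by the lanes actually processed; AVL counts down to 0.
  %evl.zext = zext i32 %evl to i64
  %index.evl.next = add i64 %evl.zext, %index
  %avl.next = sub nuw i64 %avl, %evl.zext
  %done = icmp eq i64 %index.evl.next, %n
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}

declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg)
declare <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr, <vscale x 4 x i1>, i32)
declare void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, <vscale x 4 x i1>, i32)

The reductions above (fmin, fmax, fmuladd, and the s311 fadd) keep the same skeleton; the vector accumulator is additionally threaded through @llvm.vp.merge, or folded directly with a vp.reduce call, so lanes at or past the EVL keep the previous partial result.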
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 0b3dcf80c448b..186573f32d4fc 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -19,98 +19,100 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-LABEL: define void @vector_reverse_i32( ; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; RV64-NEXT: [[ENTRY:.*]]: -; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] -; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; RV64: [[VECTOR_PH]]: ; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; RV64-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] -; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV64-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1 +; RV64-NEXT: [[N_RND_UP:%.*]] = add i64 1023, [[TMP6]] +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; RV64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] ; RV64-NEXT: br label %[[VECTOR_BODY:.*]] ; RV64: [[VECTOR_BODY]]: -; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]] -; RV64-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP10:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP24:%.*]] = zext i32 [[TMP19]] to i64 +; RV64-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP24]] +; RV64-NEXT: [[TMP10:%.*]] = sub i64 [[TMP24]], 1 ; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP10]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP9]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) +; RV64-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]] +; RV64-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i64 [[TMP11]] +; RV64-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP19]]) +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP19]]) ; RV64-NEXT: [[TMP14:%.*]] = add [[REVERSE]], splat (i32 1) ; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]] -; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP17:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP16:%.*]] = zext i32 [[TMP19]] to 
i64 +; RV64-NEXT: [[TMP25:%.*]] = mul i64 0, [[TMP16]] +; RV64-NEXT: [[TMP17:%.*]] = sub i64 [[TMP16]], 1 ; RV64-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP17]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP18]] -; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) -; RV64-NEXT: store [[REVERSE1]], ptr [[TMP20]], align 4 -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV64-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP25]] +; RV64-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[TMP20]], i64 [[TMP18]] +; RV64-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[TMP14]], splat (i1 true), i32 [[TMP19]]) +; RV64-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE1]], ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP19]]) +; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP19]] to i64 +; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]] +; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] +; RV64-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023 +; RV64-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] -; RV64-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV64-NEXT: br [[EXIT:label %.*]] ; RV64: [[SCALAR_PH]]: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, %[[ENTRY]] ] ; RV64-NEXT: br label %[[FOR_BODY:.*]] ; RV64: [[FOR_BODY]]: ; ; RV32-LABEL: define void @vector_reverse_i32( ; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; RV32-NEXT: [[ENTRY:.*]]: -; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] -; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV32-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; RV32: [[VECTOR_PH]]: ; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; RV32-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] -; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV32-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1 +; RV32-NEXT: [[N_RND_UP:%.*]] = add i64 1023, [[TMP6]] +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; RV32-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; RV32-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] ; RV32-NEXT: br label %[[VECTOR_BODY:.*]] ; RV32: [[VECTOR_BODY]]: -; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; RV32-NEXT: 
[[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP7]] -; RV32-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP5]] to i32 ; RV32-NEXT: [[TMP10:%.*]] = mul i32 0, [[TMP9]] ; RV32-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], 1 ; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP11]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 [[TMP10]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) +; RV32-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 [[TMP10]] +; RV32-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 [[TMP12]] +; RV32-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], splat (i1 true), i32 [[TMP9]]) +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP9]]) ; RV32-NEXT: [[TMP15:%.*]] = add [[REVERSE]], splat (i32 1) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP19:%.*]] = sub i32 [[TMP17]], 1 +; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP9]] +; RV32-NEXT: [[TMP19:%.*]] = sub i32 [[TMP9]], 1 ; RV32-NEXT: [[TMP20:%.*]] = mul i32 -1, [[TMP19]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 [[TMP20]] -; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) -; RV32-NEXT: store [[REVERSE1]], ptr [[TMP22]], align 4 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; RV32-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP16]], i32 [[TMP17]] +; RV32-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP18]], i32 [[TMP20]] +; RV32-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[TMP15]], splat (i1 true), i32 [[TMP9]]) +; RV32-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE1]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP9]]) +; RV32-NEXT: [[TMP23:%.*]] = zext i32 [[TMP9]] to i64 +; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP23]], [[INDEX]] +; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP23]] +; RV32-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023 +; RV32-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] -; RV32-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV32-NEXT: br [[EXIT:label %.*]] ; RV32: [[SCALAR_PH]]: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, %[[ENTRY]] ] ; RV32-NEXT: br label %[[FOR_BODY:.*]] ; RV32: [[FOR_BODY]]: ; @@ -206,10 +208,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; RV64: [[FOR_BODY_PREHEADER]]: ; RV64-NEXT: 
[[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; RV64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] ; RV64: [[VECTOR_SCEVCHECK]]: ; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 ; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 @@ -233,46 +232,50 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV64: [[VECTOR_PH]]: ; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; RV64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4 -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] -; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV64-NEXT: [[TMP19:%.*]] = sub i64 [[TMP16]], 1 +; RV64-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP19]] +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP16]] +; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; RV64-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() ; RV64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4 -; RV64-NEXT: [[TMP19:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV64-NEXT: [[TMP20:%.*]] = sub i32 [[N]], [[DOTCAST]] ; RV64-NEXT: br label %[[VECTOR_BODY:.*]] ; RV64: [[VECTOR_BODY]]: -; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[TMP20:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; RV64-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] ; RV64-NEXT: [[TMP21:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 ; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 ; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP22]] -; RV64-NEXT: [[TMP24:%.*]] = mul i64 0, [[TMP18]] -; RV64-NEXT: [[TMP25:%.*]] = sub i64 [[TMP18]], 1 +; RV64-NEXT: [[TMP24:%.*]] = zext i32 [[TMP20]] to i64 +; RV64-NEXT: [[TMP28:%.*]] = mul i64 0, [[TMP24]] +; RV64-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], 1 ; RV64-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP25]] -; RV64-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP24]] -; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP26]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP28]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) +; RV64-NEXT: [[TMP38:%.*]] = getelementptr i32, ptr [[TMP23]], i64 [[TMP28]] +; RV64-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[TMP38]], i64 [[TMP26]] +; RV64-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP27]], splat (i1 true), i32 [[TMP20]]) +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP20]]) ; RV64-NEXT: [[TMP29:%.*]] = add [[REVERSE]], splat (i32 1) ; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]] -; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP18]] -; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP18]], 1 +; RV64-NEXT: [[TMP39:%.*]] = zext i32 [[TMP20]] to i64 
+; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP39]] +; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP39]], 1 ; RV64-NEXT: [[TMP33:%.*]] = mul i64 -1, [[TMP32]] -; RV64-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i64 [[TMP31]] -; RV64-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i64 [[TMP33]] -; RV64-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP29]]) -; RV64-NEXT: store [[REVERSE4]], ptr [[TMP35]], align 4 -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] -; RV64-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV64-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[TMP30]], i64 [[TMP31]] +; RV64-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP34]], i64 [[TMP33]] +; RV64-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[TMP29]], splat (i1 true), i32 [[TMP20]]) +; RV64-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP35]], splat (i1 true), i32 [[TMP20]]) +; RV64-NEXT: [[TMP36:%.*]] = zext i32 [[TMP20]] to i64 +; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP36]], [[INDEX]] +; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP36]] +; RV64-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP0]] +; RV64-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]] ; RV64: [[SCALAR_PH]]: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] -; RV64-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] +; RV64-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] ; RV64-NEXT: br label %[[FOR_BODY:.*]] ; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; RV64-NEXT: br label %[[FOR_COND_CLEANUP]] @@ -289,10 +292,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; RV32: [[FOR_BODY_PREHEADER]]: ; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; RV32-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; RV32: [[VECTOR_MEMCHECK]]: ; RV32-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() ; RV32-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4 @@ -303,48 +303,48 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV32: [[VECTOR_PH]]: ; RV32-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; RV32-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 -; 
RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP8]] -; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV32-NEXT: [[TMP11:%.*]] = sub i64 [[TMP8]], 1 +; RV32-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP11]] +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; RV32-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; RV32-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4 -; RV32-NEXT: [[TMP11:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV32-NEXT: [[TMP12:%.*]] = sub i32 [[N]], [[DOTCAST]] ; RV32-NEXT: br label %[[VECTOR_BODY:.*]] ; RV32: [[VECTOR_BODY]]: -; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; RV32-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 ; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] ; RV32-NEXT: [[TMP13:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 ; RV32-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 ; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP14]] -; RV32-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32 ; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP16]] ; RV32-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], 1 ; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP18]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 [[TMP17]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP21]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[WIDE_LOAD]]) +; RV32-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP15]], i32 [[TMP17]] +; RV32-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP20]], i32 [[TMP19]] +; RV32-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP28]], splat (i1 true), i32 [[TMP16]]) +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP16]]) ; RV32-NEXT: [[TMP22:%.*]] = add [[REVERSE]], splat (i32 1) ; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; RV32-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP10]] to i32 -; RV32-NEXT: [[TMP25:%.*]] = mul i32 0, [[TMP24]] -; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], 1 +; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP16]] +; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP16]], 1 ; RV32-NEXT: [[TMP27:%.*]] = mul i32 -1, [[TMP26]] -; RV32-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 [[TMP25]] -; RV32-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 [[TMP27]] -; RV32-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP22]]) -; RV32-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; RV32-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP23]], i32 [[TMP21]] +; RV32-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i32 [[TMP27]] +; 
RV32-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[TMP22]], splat (i1 true), i32 [[TMP16]]) +; RV32-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP25]], splat (i1 true), i32 [[TMP16]]) +; RV32-NEXT: [[TMP29:%.*]] = zext i32 [[TMP16]] to i64 +; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP29]], [[INDEX]] +; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP29]] +; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP0]] +; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV32-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]] ; RV32: [[SCALAR_PH]]: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] -; RV32-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] +; RV32-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] ; RV32-NEXT: br label %[[FOR_BODY:.*]] ; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; RV32-NEXT: br label %[[FOR_COND_CLEANUP]] @@ -487,10 +487,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV64-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; RV64: [[FOR_BODY_PREHEADER]]: ; RV64-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; RV64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] ; RV64: [[VECTOR_SCEVCHECK]]: ; RV64-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 ; RV64-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 @@ -514,46 +511,50 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV64: [[VECTOR_PH]]: ; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; RV64-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4 -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] -; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV64-NEXT: [[TMP19:%.*]] = sub i64 [[TMP16]], 1 +; RV64-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP19]] +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP16]] +; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; RV64-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() ; RV64-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 4 -; RV64-NEXT: [[TMP19:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV64-NEXT: [[TMP20:%.*]] = sub i32 [[N]], [[DOTCAST]] ; RV64-NEXT: br label %[[VECTOR_BODY:.*]] ; RV64: [[VECTOR_BODY]]: -; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; RV64-NEXT: [[TMP20:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; RV64-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] ; RV64-NEXT: [[TMP21:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 ; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 ; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP22]] -; RV64-NEXT: [[TMP24:%.*]] = mul i64 0, [[TMP18]] -; RV64-NEXT: [[TMP25:%.*]] = sub i64 [[TMP18]], 1 +; RV64-NEXT: [[TMP24:%.*]] = zext i32 [[TMP20]] to i64 +; RV64-NEXT: [[TMP28:%.*]] = mul i64 0, [[TMP24]] +; RV64-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], 1 ; RV64-NEXT: [[TMP26:%.*]] = mul i64 -1, [[TMP25]] -; RV64-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP24]] -; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP26]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP28]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) +; RV64-NEXT: [[TMP38:%.*]] = getelementptr float, ptr [[TMP23]], i64 [[TMP28]] +; RV64-NEXT: [[TMP27:%.*]] = getelementptr float, ptr [[TMP38]], i64 [[TMP26]] +; RV64-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP27]], splat (i1 true), i32 [[TMP20]]) +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP20]]) ; RV64-NEXT: [[TMP29:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) ; RV64-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]] -; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP18]] -; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP18]], 1 +; RV64-NEXT: [[TMP39:%.*]] = zext i32 [[TMP20]] to i64 +; RV64-NEXT: [[TMP31:%.*]] = mul i64 0, [[TMP39]] +; RV64-NEXT: [[TMP32:%.*]] = sub i64 [[TMP39]], 1 ; RV64-NEXT: [[TMP33:%.*]] = mul i64 -1, [[TMP32]] -; RV64-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP30]], i64 [[TMP31]] -; RV64-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP33]] -; RV64-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP29]]) -; RV64-NEXT: store [[REVERSE4]], ptr [[TMP35]], align 4 -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] -; RV64-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; RV64-NEXT: [[TMP34:%.*]] = getelementptr float, ptr [[TMP30]], i64 [[TMP31]] +; RV64-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[TMP34]], i64 [[TMP33]] +; RV64-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[TMP29]], splat (i1 true), i32 [[TMP20]]) +; RV64-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP35]], splat (i1 true), i32 [[TMP20]]) +; RV64-NEXT: [[TMP36:%.*]] = zext i32 [[TMP20]] to i64 +; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP36]], [[INDEX]] +; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP36]] +; RV64-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP0]] +; RV64-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV64-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]] ; RV64: [[SCALAR_PH]]: -; 
RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] -; RV64-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] +; RV64-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] ; RV64-NEXT: br label %[[FOR_BODY:.*]] ; RV64: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; RV64-NEXT: br label %[[FOR_COND_CLEANUP]] @@ -570,10 +571,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV32-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] ; RV32: [[FOR_BODY_PREHEADER]]: ; RV32-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; RV32-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; RV32: [[VECTOR_MEMCHECK]]: ; RV32-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() ; RV32-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4 @@ -584,48 +582,48 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; RV32: [[VECTOR_PH]]: ; RV32-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; RV32-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP8]] -; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; RV32-NEXT: [[TMP11:%.*]] = sub i64 [[TMP8]], 1 +; RV32-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP11]] +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; RV32-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; RV32-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4 -; RV32-NEXT: [[TMP11:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; RV32-NEXT: [[TMP12:%.*]] = sub i32 [[N]], [[DOTCAST]] ; RV32-NEXT: br label %[[VECTOR_BODY:.*]] ; RV32: [[VECTOR_BODY]]: -; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; RV32-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i32 ; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[DOTCAST3]] ; RV32-NEXT: [[TMP13:%.*]] = add nsw i32 [[OFFSET_IDX]], -1 ; RV32-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 ; RV32-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] -; RV32-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32 ; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP16]] ; RV32-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], 1 ; RV32-NEXT: [[TMP19:%.*]] = mul i32 -1, [[TMP18]] -; RV32-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 
[[TMP17]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP19]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP21]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) +; RV32-NEXT: [[TMP20:%.*]] = getelementptr float, ptr [[TMP15]], i32 [[TMP17]] +; RV32-NEXT: [[TMP28:%.*]] = getelementptr float, ptr [[TMP20]], i32 [[TMP19]] +; RV32-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP28]], splat (i1 true), i32 [[TMP16]]) +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP16]]) ; RV32-NEXT: [[TMP22:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) ; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; RV32-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP10]] to i32 -; RV32-NEXT: [[TMP25:%.*]] = mul i32 0, [[TMP24]] -; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], 1 +; RV32-NEXT: [[TMP21:%.*]] = mul i32 0, [[TMP16]] +; RV32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP16]], 1 ; RV32-NEXT: [[TMP27:%.*]] = mul i32 -1, [[TMP26]] -; RV32-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP25]] -; RV32-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i32 [[TMP27]] -; RV32-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP22]]) -; RV32-NEXT: store [[REVERSE4]], ptr [[TMP29]], align 4 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; RV32-NEXT: [[TMP24:%.*]] = getelementptr float, ptr [[TMP23]], i32 [[TMP21]] +; RV32-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP24]], i32 [[TMP27]] +; RV32-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[TMP22]], splat (i1 true), i32 [[TMP16]]) +; RV32-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP25]], splat (i1 true), i32 [[TMP16]]) +; RV32-NEXT: [[TMP29:%.*]] = zext i32 [[TMP16]] to i64 +; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP29]], [[INDEX]] +; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP29]] +; RV32-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP0]] +; RV32-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; RV32-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; RV32-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]] ; RV32: [[SCALAR_PH]]: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] -; RV32-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ] +; RV32-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[N]], %[[FOR_BODY_PREHEADER]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ] ; RV32-NEXT: br label %[[FOR_BODY:.*]] ; RV32: [[FOR_COND_CLEANUP_LOOPEXIT]]: ; RV32-NEXT: br label %[[FOR_COND_CLEANUP]] @@ -762,98 +760,100 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) { ; RV64-LABEL: define void @vector_reverse_f32_simplify( ; RV64-SAME: ptr noalias 
[[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { ; RV64-NEXT: [[ENTRY:.*]]: -; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] -; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; RV64: [[VECTOR_PH]]: ; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; RV64-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] -; RV64-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV64-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1 +; RV64-NEXT: [[N_RND_UP:%.*]] = add i64 1023, [[TMP6]] +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; RV64-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; RV64-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; RV64-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] ; RV64-NEXT: br label %[[VECTOR_BODY:.*]] ; RV64: [[VECTOR_BODY]]: -; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV64-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; RV64-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV64-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] -; RV64-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP10:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP24:%.*]] = zext i32 [[TMP19]] to i64 +; RV64-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP24]] +; RV64-NEXT: [[TMP10:%.*]] = sub i64 [[TMP24]], 1 ; RV64-NEXT: [[TMP11:%.*]] = mul i64 -1, [[TMP10]] -; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP9]] -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP11]] -; RV64-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 4 -; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) +; RV64-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[TMP9]] +; RV64-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP12]], i64 [[TMP11]] +; RV64-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP19]]) +; RV64-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP19]]) ; RV64-NEXT: [[TMP14:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) ; RV64-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] -; RV64-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]] -; RV64-NEXT: [[TMP17:%.*]] = sub i64 [[TMP5]], 1 +; RV64-NEXT: [[TMP16:%.*]] = zext i32 [[TMP19]] to i64 +; RV64-NEXT: [[TMP25:%.*]] = mul i64 0, [[TMP16]] +; RV64-NEXT: [[TMP17:%.*]] = sub i64 [[TMP16]], 1 ; RV64-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP17]] -; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] -; RV64-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[TMP18]] -; RV64-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP14]]) -; RV64-NEXT: store [[REVERSE1]], ptr [[TMP20]], align 4 
-; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV64-NEXT: [[TMP20:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[TMP25]] +; RV64-NEXT: [[TMP21:%.*]] = getelementptr float, ptr [[TMP20]], i64 [[TMP18]] +; RV64-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[TMP14]], splat (i1 true), i32 [[TMP19]]) +; RV64-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[VP_REVERSE1]], ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP19]]) +; RV64-NEXT: [[TMP22:%.*]] = zext i32 [[TMP19]] to i64 +; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]] +; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] +; RV64-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023 +; RV64-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] -; RV64-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV64-NEXT: br [[EXIT:label %.*]] ; RV64: [[SCALAR_PH]]: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, %[[ENTRY]] ] ; RV64-NEXT: br label %[[FOR_BODY:.*]] ; RV64: [[FOR_BODY]]: ; ; RV32-LABEL: define void @vector_reverse_f32_simplify( ; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { ; RV32-NEXT: [[ENTRY:.*]]: -; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]] -; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; RV32-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; RV32: [[VECTOR_PH]]: ; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; RV32-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]] -; RV32-NEXT: [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]] +; RV32-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1 +; RV32-NEXT: [[N_RND_UP:%.*]] = add i64 1023, [[TMP6]] +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; RV32-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; RV32-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; RV32-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; RV32-NEXT: [[TMP6:%.*]] = sub i64 1023, [[N_VEC]] ; RV32-NEXT: br label %[[VECTOR_BODY:.*]] ; RV32: [[VECTOR_BODY]]: -; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[AVL:%.*]] = phi i64 [ 1023, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; RV32-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; RV32-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; RV32-NEXT: [[TMP7:%.*]] = add nsw i64 [[OFFSET_IDX]], -1 ; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] -; RV32-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP5]] to i32 ; RV32-NEXT: [[TMP10:%.*]] = mul i32 0, [[TMP9]] ; RV32-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], 1 ; RV32-NEXT: [[TMP12:%.*]] = mul i32 -1, [[TMP11]] -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr 
[[TMP8]], i32 [[TMP10]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP12]] -; RV32-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 4 -; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4f32( [[WIDE_LOAD]]) +; RV32-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP8]], i32 [[TMP10]] +; RV32-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP13]], i32 [[TMP12]] +; RV32-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], splat (i1 true), i32 [[TMP9]]) +; RV32-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP9]]) ; RV32-NEXT: [[TMP15:%.*]] = fadd [[REVERSE]], splat (float 1.000000e+00) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] -; RV32-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP5]] to i32 -; RV32-NEXT: [[TMP18:%.*]] = mul i32 0, [[TMP17]] -; RV32-NEXT: [[TMP19:%.*]] = sub i32 [[TMP17]], 1 +; RV32-NEXT: [[TMP17:%.*]] = mul i32 0, [[TMP9]] +; RV32-NEXT: [[TMP19:%.*]] = sub i32 [[TMP9]], 1 ; RV32-NEXT: [[TMP20:%.*]] = mul i32 -1, [[TMP19]] -; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 [[TMP18]] -; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 [[TMP20]] -; RV32-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4f32( [[TMP15]]) -; RV32-NEXT: store [[REVERSE1]], ptr [[TMP22]], align 4 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; RV32-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[TMP16]], i32 [[TMP17]] +; RV32-NEXT: [[TMP22:%.*]] = getelementptr float, ptr [[TMP18]], i32 [[TMP20]] +; RV32-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv4f32( [[TMP15]], splat (i1 true), i32 [[TMP9]]) +; RV32-NEXT: call void @llvm.vp.store.nxv4f32.p0( [[VP_REVERSE1]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP9]]) +; RV32-NEXT: [[TMP23:%.*]] = zext i32 [[TMP9]] to i64 +; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP23]], [[INDEX]] +; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP23]] +; RV32-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023 +; RV32-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 1023, [[N_VEC]] -; RV32-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; RV32-NEXT: br [[EXIT:label %.*]] ; RV32: [[SCALAR_PH]]: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ] +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, %[[ENTRY]] ] ; RV32-NEXT: br label %[[FOR_BODY:.*]] ; RV32: [[FOR_BODY]]: ; @@ -984,7 +984,7 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; RV64-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 -; RV64-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; RV64-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: ; RV64-NEXT: br label %[[SCALAR_PH]] ; RV64: [[SCALAR_PH]]: @@ -1036,7 +1036,7 @@ define void @vector_reverse_irregular_type(ptr noalias 
%A, ptr noalias %B) { ; RV32-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; RV32-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 -; RV32-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; RV32-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: ; RV32-NEXT: br label %[[SCALAR_PH]] ; RV32: [[SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll index 3370e921b089b..770026ed3eef6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll @@ -10,36 +10,38 @@ target triple = "riscv64" define void @test(ptr %p) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP6]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 200, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 32 +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP7]], splat (i1 true), i32 [[TMP8]]) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 200 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP9]] -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP10]], align 32 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP10]], splat (i1 true), i32 [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] ; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 ; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 200 @@ -47,7 +49,7 @@ define void @test(ptr %p) { ; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -86,7 +88,7 @@ define void @test_may_clobber(ptr %p) { ; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: @@ -101,7 +103,7 @@ define void @test_may_clobber(ptr %p) { ; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -127,36 +129,38 @@ exit: define void @trivial_due_max_vscale(ptr %p) { ; CHECK-LABEL: @trivial_due_max_vscale( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP6]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 200, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 
32 +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP7]], splat (i1 true), i32 [[TMP8]]) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 8192 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP9]] -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP10]], align 32 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP10]], splat (i1 true), i32 [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] ; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 ; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 8192 @@ -164,7 +168,7 @@ define void @trivial_due_max_vscale(ptr %p) { ; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -190,36 +194,38 @@ exit: define void @no_high_lmul_or_interleave(ptr %p) { ; CHECK-LABEL: @no_high_lmul_or_interleave( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP6]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi 
i64 [ 200, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 32 +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP7]], splat (i1 true), i32 [[TMP8]]) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1024 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP9]] -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP10]], align 32 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP_LOAD]], ptr align 32 [[TMP10]], splat (i1 true), i32 [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] ; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 ; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 1024 @@ -227,7 +233,7 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -277,7 +283,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 24) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -291,7 +297,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) { ; CHECK-NEXT: store i16 0, ptr [[GEP_OFF]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], 
label [[EXIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll index e51f6fa7484c8..b8d9c2838c652 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll @@ -8,15 +8,14 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @vector_add( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP7]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 @@ -24,28 +23,31 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP6]], splat (i1 true), i32 [[TMP10]]) ; CHECK-NEXT: [[TMP8:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP8]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP8]], ptr align 8 [[TMP6]], splat (i1 true), i32 [[TMP10]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ 
[[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ELEM]], [[V]] ; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -72,15 +74,14 @@ for.end: define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) { ; CHECK-LABEL: @vector_add_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP7]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 @@ -88,28 +89,31 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP10]]) ; CHECK-NEXT: [[TMP8:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP8]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP8]], ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP10]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; 
CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ELEM:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[ELEM]], [[V]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -174,15 +178,14 @@ for.end: define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) { ; CHECK-LABEL: @indexed_store( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP7]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 @@ -190,28 +193,31 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP6]], splat (i1 true), i32 [[TMP10]]) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP8]], i32 8, splat (i1 true)) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; 
CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8 ; CHECK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]] ; CHECK-NEXT: store i64 [[V]], ptr [[AADDR]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -235,40 +241,43 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) { ; CHECK-LABEL: @indexed_load( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP7]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, 
ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP6]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison) -; CHECK-NEXT: [[TMP9]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP9]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP13]], <vscale x 2 x i64> [[VEC_PHI]], i32 [[TMP12]]) +; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP9]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8 ; CHECK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]] @@ -276,7 +285,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]] ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] @@ -303,15 +312,14 @@ for.end: define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @splat_int( ; CHECK-NEXT: entry: -; CHECK-NEXT: 
[[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP9]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0 @@ -319,24 +327,27 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP6]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]) +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -358,15 +369,14 @@ for.end: 
define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) { ; CHECK-LABEL: @splat_ptr( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP9]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[V:%.*]], i64 0 @@ -374,24 +384,27 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[TMP6]], splat (i1 true), i32 [[TMP10]]) +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: store ptr [[V]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll index 5c6febc41da2d..2d2d76a5dd9d0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll @@ -5,50 +5,52 @@ define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) { ; CHECK-LABEL: define i32 @select_icmp( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr readonly captures(none) [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP14]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp sge [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8]] = or [[VEC_PHI]], [[TMP7]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP8]] = call @llvm.vp.merge.nxv4i1( [[TMP7]], splat (i1 true), [[VEC_PHI]], i32 [[TMP14]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP8]]) ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[Y]], i32 0 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[A:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP12]], [[X]] ; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[COND_LCSSA]] @@ -75,50 +77,52 @@ define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) { ; CHECK-LABEL: define i32 @select_fcmp( ; CHECK-SAME: float [[X:%.*]], i32 [[Y:%.*]], ptr readonly captures(none) [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP13]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[X]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label 
%[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP14]]) ; CHECK-NEXT: [[TMP7:%.*]] = fcmp fast uge [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8]] = or [[VEC_PHI]], [[TMP7]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP8]] = call @llvm.vp.merge.nxv4i1( [[TMP7]], splat (i1 true), [[VEC_PHI]], i32 [[TMP14]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP8]]) ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[Y]], i32 0 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[A:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast olt float [[TMP12]], [[X]] ; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label 
%[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[COND_LCSSA]] @@ -145,48 +149,50 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) { ; CHECK-LABEL: define i32 @select_const_i32_from_icmp( ; CHECK-SAME: ptr readonly captures(none) [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP20:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP20]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP21]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], splat (i32 3) -; CHECK-NEXT: [[TMP8]] = or [[VEC_PHI]], [[TMP7]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP8]] = call @llvm.vp.merge.nxv4i1( [[TMP7]], splat (i1 true), [[VEC_PHI]], i32 [[TMP21]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP21]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP8]]) ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 7, i32 3 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[TMP18:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[TMP17:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[TMP18:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ 3, %[[SCALAR_PH]] ], [ [[TMP17:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP12]]
 ; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4
 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 3
 ; CHECK-NEXT: [[TMP17]] = select i1 [[TMP16]], i32 [[TMP13]], i32 7
 ; CHECK-NEXT: [[TMP18]] = add nuw nsw i64 [[TMP12]], 1
 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[N]]
-; CHECK-NEXT: br i1 [[TMP19]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[DOTLCSSA]]
@@ -213,48 +219,50 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64
 ; CHECK-LABEL: define i32 @select_i32_from_icmp(
 ; CHECK-SAME: ptr readonly captures(none) [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP20:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP20]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP21]])
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3)
-; CHECK-NEXT: [[TMP8]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP7]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP8]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP21]])
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP21]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
 ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]]
 ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[B]], i32 [[A]]
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[TMP18:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[TMP17:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[TMP18:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[A]], %[[SCALAR_PH]] ], [ [[TMP17:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP12]]
 ; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4
 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 3
 ; CHECK-NEXT: [[TMP17]] = select i1 [[TMP16]], i32 [[TMP13]], i32 [[B]]
 ; CHECK-NEXT: [[TMP18]] = add nuw nsw i64 [[TMP12]], 1
 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[N]]
-; CHECK-NEXT: br i1 [[TMP19]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[DOTLCSSA]]
@@ -281,48 +289,50 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: define i32 @select_const_i32_from_fcmp(
 ; CHECK-SAME: ptr readonly captures(none) [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:
[[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP20:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP20]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP21]]) ; CHECK-NEXT: [[TMP7:%.*]] = fcmp fast one [[WIDE_LOAD]], splat (float 3.000000e+00) -; CHECK-NEXT: [[TMP8]] = or [[VEC_PHI]], [[TMP7]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP8]] = call @llvm.vp.merge.nxv4i1( [[TMP7]], splat (i1 true), [[VEC_PHI]], i32 [[TMP21]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP21]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP8]]) ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 1, i32 2 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[TMP18:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[TMP17:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[TMP18:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[TMP17:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: 
[[TMP14:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast ueq float [[TMP15]], 3.000000e+00 ; CHECK-NEXT: [[TMP17]] = select i1 [[TMP16]], i32 [[TMP13]], i32 1 ; CHECK-NEXT: [[TMP18]] = add nuw nsw i64 [[TMP12]], 1 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[N]] -; CHECK-NEXT: br i1 [[TMP19]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP19]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[DOTLCSSA]] @@ -386,45 +396,53 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1 ; CHECK-LABEL: define i32 @pred_select_const_i32_from_icmp( ; CHECK-SAME: ptr noalias readonly captures(none) [[SRC1:%.*]], ptr noalias readonly captures(none) [[SRC2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP17]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP19:%.*]] = icmp ult [[TMP18]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP6]], splat (i1 true), i32 [[TMP17]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt [[WIDE_LOAD]], splat (i32 35) +; CHECK-NEXT: [[TMP20:%.*]] = select [[TMP19]], [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP8]], 
i32 4, [[TMP7]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], [[TMP7]], i32 [[TMP17]]) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], splat (i32 2) ; CHECK-NEXT: [[TMP10:%.*]] = or [[VEC_PHI]], [[TMP9]] -; CHECK-NEXT: [[PREDPHI]] = select [[TMP7]], [[TMP10]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[PREDPHI1:%.*]] = select [[TMP20]], [[TMP10]], [[VEC_PHI]] +; CHECK-NEXT: [[PREDPHI]] = call @llvm.vp.merge.nxv4i1( splat (i1 true), [[PREDPHI1]], [[VEC_PHI]], i32 [[TMP17]]) +; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[PREDPHI]]) ; CHECK-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 1, i32 0 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_INC:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %[[FOR_INC]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_INC:.*]] ], [ 0, %[[SCALAR_PH]] ] +; CHECK-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %[[FOR_INC]] ], [ 0, %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]] ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP14]], 35 @@ -439,7 +457,7 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1 ; CHECK-NEXT: [[R_1]] = phi i32 [ [[R_012]], %[[FOR_BODY]] ], [ [[SPEC_SELECT]], %[[IF_THEN]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: [[FOR_END_LOOPEXIT]]: ; CHECK-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %[[FOR_INC]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[R_1_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll index 13a4b166431c8..8c804e5e62f5c 100644 --- 
a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll @@ -4,28 +4,16 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_range(4,1024) { ; CHECK-LABEL: @small_trip_count_min_vlen_128( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 1) -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP1]], align 4 -; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 [[IV]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP1]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i32 [[IV]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP1]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -49,28 +37,16 @@ exit: define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_range(1,1024) { ; CHECK-LABEL: @small_trip_count_min_vlen_32( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 1) -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP1]], align 4 -; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 [[IV]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP1]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i32 [[IV]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP1]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index df907dcbd4606..f0a74fc2b8bf0 100644 
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -6,45 +6,47 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-LABEL: @single_constant_stride_int_scaled( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH1:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP10:%.*]] = mul [[TMP8]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 1, [[TMP7]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP12]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP14]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP15]], i32 4, splat (i1 true), poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP15]], splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: [[TMP16:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP16]], [[TMP15]], i32 4, splat (i1 true)) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP16]], align 4 [[TMP15]], splat (i1 true), i32 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], 
[[DOTSPLAT]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH1]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET:%.*]] = mul nuw nsw i64 [[I]], 8 ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET]] ; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 @@ -52,7 +54,7 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[SCALAR_PH]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -77,46 +79,48 @@ exit: define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-LABEL: @single_constant_stride_int_iv( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP10]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 64 ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i64 64) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 64, [[TMP5]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: 
[[TMP9:%.*]] = mul i64 64, [[TMP11]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP12]], i32 4, splat (i1 true), poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP12]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP13]], [[TMP12]], i32 4, splat (i1 true)) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP13]], align 4 [[TMP12]], splat (i1 true), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP14]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[OFFSET_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[OFFSET:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[OFFSET_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET]] ; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 @@ -124,7 +128,7 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], 64 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -151,55 +155,52 @@ exit: define void @single_constant_stride_ptr_iv(ptr %p) { ; CHECK-LABEL: @single_constant_stride_ptr_iv( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label 
[[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH1:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[N_VEC]], 8 -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP18]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P:%.*]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul [[TMP14]], splat (i64 8) ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[VECTOR_GEP]], i32 0 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP17]], align 4 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[VECTOR_GEP]], splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP19]], splat (i32 1) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP20]], [[VECTOR_GEP]], i32 4, splat (i1 true)) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 8, [[TMP8]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP20]], align 4 [[VECTOR_GEP]], splat (i1 true), i32 [[TMP11]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 8, [[TMP10]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; 
CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[P]], [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[PTR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH1]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[P]], [[SCALAR_PH1]] ], [ [[PTR_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[PTR]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 ; CHECK-NEXT: store i32 [[Y0]], ptr [[PTR]], align 4 ; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 ; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[DONE]], label [[SCALAR_PH]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -225,36 +226,37 @@ exit: define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-LABEL: @single_stride_int_scaled( ; NOSTRIDED-NEXT: entry: -; NOSTRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; NOSTRIDED-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]]) -; NOSTRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]] -; NOSTRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; NOSTRIDED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; NOSTRIDED: vector.scevcheck: ; NOSTRIDED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[STRIDE:%.*]], 1 ; NOSTRIDED-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; NOSTRIDED: vector.ph: ; NOSTRIDED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 -; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; NOSTRIDED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP4]], 1 +; NOSTRIDED-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] +; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; NOSTRIDED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4 ; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; NOSTRIDED: vector.body: ; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NOSTRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; NOSTRIDED-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; NOSTRIDED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[INDEX]] -; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], splat (i1 true), i32 [[TMP7]]) ; NOSTRIDED-NEXT: [[TMP10:%.*]] = add [[WIDE_LOAD]], splat (i32 1) -; NOSTRIDED-NEXT: store [[TMP10]], ptr [[TMP8]], align 4 -; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; 
NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; NOSTRIDED-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP10]], ptr align 4 [[TMP8]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]]) +; NOSTRIDED-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 +; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; NOSTRIDED: middle.block: -; NOSTRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; NOSTRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: -; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; NOSTRIDED-NEXT: br label [[LOOP:%.*]] ; NOSTRIDED: loop: ; NOSTRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -265,7 +267,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -306,37 +308,38 @@ exit: define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-LABEL: @single_stride_int_iv( ; NOSTRIDED-NEXT: entry: -; NOSTRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; NOSTRIDED-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]]) -; NOSTRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]] -; NOSTRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; NOSTRIDED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; NOSTRIDED: vector.scevcheck: ; NOSTRIDED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[STRIDE:%.*]], 1 ; NOSTRIDED-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; NOSTRIDED: vector.ph: ; NOSTRIDED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 -; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; NOSTRIDED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP4]], 1 +; NOSTRIDED-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] +; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; NOSTRIDED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4 ; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; NOSTRIDED: vector.body: ; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NOSTRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; NOSTRIDED-NEXT: [[TMP7:%.*]] =
call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; NOSTRIDED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[INDEX]] -; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4 +; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]]) ; NOSTRIDED-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 1) -; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP8]], align 4 -; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; NOSTRIDED-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP10]], ptr align 4 [[TMP8]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]]) +; NOSTRIDED-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 +; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; NOSTRIDED: middle.block: -; NOSTRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; NOSTRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: -; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; NOSTRIDED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; NOSTRIDED-NEXT: br label [[LOOP:%.*]] ; NOSTRIDED: loop: ; NOSTRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -348,7 +351,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]] ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -429,11 +432,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: entry: ; NOSTRIDED-NEXT: [[P3:%.*]] = ptrtoint ptr [[P:%.*]] to i64 ; NOSTRIDED-NEXT: [[P21:%.*]] = ptrtoint ptr [[P2:%.*]] to i64 -; NOSTRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; NOSTRIDED-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) -; NOSTRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]] -; NOSTRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; NOSTRIDED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; NOSTRIDED: vector.scevcheck: ; NOSTRIDED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[STRIDE:%.*]], 1 ; NOSTRIDED-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] @@ -447,26 +446,31 @@ define void @double_stride_int_scaled(ptr %p,
ptr %p2, i64 %stride) { ; NOSTRIDED: vector.ph: ; NOSTRIDED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 -; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP8]] -; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; NOSTRIDED-NEXT: [[TMP11:%.*]] = sub i64 [[TMP8]], 1 +; NOSTRIDED-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP11]] +; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]] +; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; NOSTRIDED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4 ; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; NOSTRIDED: vector.body: ; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NOSTRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; NOSTRIDED-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; NOSTRIDED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P]], i64 [[INDEX]] -; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], splat (i1 true), i32 [[TMP16]]) ; NOSTRIDED-NEXT: [[TMP14:%.*]] = add [[WIDE_LOAD]], splat (i32 1) ; NOSTRIDED-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P2]], i64 [[INDEX]] -; NOSTRIDED-NEXT: store [[TMP14]], ptr [[TMP15]], align 4 -; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; NOSTRIDED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NOSTRIDED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; NOSTRIDED-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP14]], ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP16]]) +; NOSTRIDED-NEXT: [[TMP13:%.*]] = zext i32 [[TMP16]] to i64 +; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]] +; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; NOSTRIDED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; NOSTRIDED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; NOSTRIDED: middle.block: -; NOSTRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; NOSTRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: -; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; NOSTRIDED-NEXT: br label [[LOOP:%.*]] ; NOSTRIDED: loop: ; NOSTRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -478,17 +482,13 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4 ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; ; STRIDED-LABEL: @double_stride_int_scaled( ; STRIDED-NEXT: entry: -; 
STRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; STRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; STRIDED-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 80, i64 [[TMP1]]) -; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]] -; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; STRIDED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; STRIDED: vector.scevcheck: ; STRIDED-NEXT: [[TMP24:%.*]] = shl i64 [[STRIDE:%.*]], 2 ; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], -4 @@ -539,8 +539,10 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED: vector.ph: ; STRIDED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 -; STRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP9]] -; STRIDED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; STRIDED-NEXT: [[TMP42:%.*]] = sub i64 [[TMP9]], 1 +; STRIDED-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP42]] +; STRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP9]] +; STRIDED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; STRIDED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4 ; STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[STRIDE]], i64 0 @@ -548,28 +550,32 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i64() ; STRIDED-NEXT: [[TMP14:%.*]] = mul [[TMP12]], splat (i64 1) ; STRIDED-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP14]] -; STRIDED-NEXT: [[TMP17:%.*]] = mul i64 1, [[TMP11]] -; STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP17]], i64 0 -; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; STRIDED: vector.body: ; STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; STRIDED-NEXT: [[TMP43:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; STRIDED-NEXT: [[TMP44:%.*]] = zext i32 [[TMP43]] to i64 +; STRIDED-NEXT: [[TMP45:%.*]] = mul i64 1, [[TMP44]] +; STRIDED-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i64 [[TMP45]], i64 0 +; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer ; STRIDED-NEXT: [[TMP18:%.*]] = mul nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT1]] ; STRIDED-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[P]], [[TMP18]] -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP19]], i32 4, splat (i1 true), poison), !alias.scope [[META8:![0-9]+]] +; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP19]], splat (i1 true), i32 [[TMP43]]), !alias.scope [[META9:![0-9]+]] ; STRIDED-NEXT: [[TMP20:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], [[TMP18]] -; STRIDED-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP20]], [[TMP21]], i32 4, splat (i1 true)), !alias.scope [[META11:![0-9]+]], !noalias [[META8]] -; STRIDED-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP20]], align 4 [[TMP21]], splat (i1 true), i32 [[TMP43]]), !alias.scope [[META12:![0-9]+]], !noalias [[META9]] +; STRIDED-NEXT: [[TMP46:%.*]] = zext i32 [[TMP43]] to i64 +; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP46]], [[INDEX]] +; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP46]] ; STRIDED-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] -; STRIDED-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; STRIDED-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; STRIDED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; STRIDED: middle.block: -; STRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; STRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; STRIDED-NEXT: br label [[EXIT:%.*]] ; STRIDED: scalar.ph: -; STRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ] +; STRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ] ; STRIDED-NEXT: br label [[LOOP:%.*]] ; STRIDED: loop: ; STRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -581,7 +587,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4 ; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; STRIDED: exit: ; STRIDED-NEXT: ret void ; @@ -607,37 +613,38 @@ exit: define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-LABEL: @double_stride_int_iv( ; NOSTRIDED-NEXT: entry: -; NOSTRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; NOSTRIDED-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]]) -; NOSTRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]] -; NOSTRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; NOSTRIDED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; NOSTRIDED: vector.scevcheck: ; NOSTRIDED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[STRIDE:%.*]], 1 ; NOSTRIDED-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; NOSTRIDED: vector.ph: ; NOSTRIDED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 -; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; NOSTRIDED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP4]], 1 +; NOSTRIDED-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] +; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; NOSTRIDED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4 ; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; NOSTRIDED: vector.body: ; NOSTRIDED-NEXT: [[INDEX:%.*]] 
= phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NOSTRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; NOSTRIDED-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; NOSTRIDED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[INDEX]] -; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], splat (i1 true), i32 [[TMP7]]) ; NOSTRIDED-NEXT: [[TMP10:%.*]] = add [[WIDE_LOAD]], splat (i32 1) -; NOSTRIDED-NEXT: store [[TMP10]], ptr [[TMP8]], align 4 -; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; NOSTRIDED-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP10]], ptr align 4 [[TMP8]], splat (i1 true), i32 [[TMP7]]) +; NOSTRIDED-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 +; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; NOSTRIDED: middle.block: -; NOSTRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; NOSTRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; NOSTRIDED-NEXT: br label [[EXIT:%.*]] ; NOSTRIDED: scalar.ph: -; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; NOSTRIDED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; NOSTRIDED-NEXT: br label [[LOOP:%.*]] ; NOSTRIDED: loop: ; NOSTRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -650,7 +657,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]] ; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -692,7 +699,6 @@ exit: ret void } - define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-LABEL: @double_stride_ptr_iv( ; NOSTRIDED-NEXT: entry: @@ -714,11 +720,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; ; STRIDED-LABEL: @double_stride_ptr_iv( ; STRIDED-NEXT: entry: -; STRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; STRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; STRIDED-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]]) -; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]] -; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; STRIDED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], 
label [[VECTOR_MEMCHECK:%.*]] ; STRIDED: vector.memcheck: ; STRIDED-NEXT: [[TMP3:%.*]] = mul i64 [[STRIDE:%.*]], 1023 ; STRIDED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[P2:%.*]], i64 [[TMP3]] @@ -740,19 +742,18 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED: vector.ph: ; STRIDED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 -; STRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP9]] -; STRIDED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; STRIDED-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], 1 +; STRIDED-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP10]] +; STRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP9]] +; STRIDED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; STRIDED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4 -; STRIDED-NEXT: [[TMP10:%.*]] = mul i64 [[N_VEC]], [[STRIDE]] -; STRIDED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]] -; STRIDED-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], [[STRIDE]] -; STRIDED-NEXT: [[IND_END7:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP11]] ; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; STRIDED: vector.body: ; STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[POINTER_PHI11:%.*]] = phi ptr [ [[P2]], [[VECTOR_PH]] ], [ [[PTR_IND12:%.*]], [[VECTOR_BODY]] ] +; STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[TMP19:%.*]] = call @llvm.stepvector.nxv4i64() ; STRIDED-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement poison, i64 [[STRIDE]], i64 0 ; STRIDED-NEXT: [[DOTSPLAT10:%.*]] = shufflevector [[DOTSPLATINSERT9]], poison, zeroinitializer @@ -761,23 +762,27 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[TMP27:%.*]] = call @llvm.stepvector.nxv4i64() ; STRIDED-NEXT: [[TMP21:%.*]] = mul [[TMP27]], [[DOTSPLAT10]] ; STRIDED-NEXT: [[VECTOR_GEP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP21]] -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[VECTOR_GEP7]], i32 4, splat (i1 true), poison), !alias.scope [[META15:![0-9]+]] +; STRIDED-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[VECTOR_GEP7]], splat (i1 true), i32 [[TMP14]]), !alias.scope [[META16:![0-9]+]] ; STRIDED-NEXT: [[TMP30:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) -; STRIDED-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP30]], [[VECTOR_GEP]], i32 4, splat (i1 true)), !alias.scope [[META18:![0-9]+]], !noalias [[META15]] -; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] -; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP13]] +; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP30]], align 4 [[VECTOR_GEP]], splat (i1 true), i32 [[TMP14]]), !alias.scope [[META19:![0-9]+]], !noalias [[META16]] +; STRIDED-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] +; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; STRIDED-NEXT: [[TMP20:%.*]] = zext i32 [[TMP14]] to i64 +; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP20]] 
; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP25]] -; STRIDED-NEXT: [[TMP17:%.*]] = mul i64 [[STRIDE]], [[TMP13]] +; STRIDED-NEXT: [[TMP22:%.*]] = zext i32 [[TMP14]] to i64 +; STRIDED-NEXT: [[TMP17:%.*]] = mul i64 [[STRIDE]], [[TMP22]] ; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP17]] -; STRIDED-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; STRIDED-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; STRIDED-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; STRIDED: middle.block: -; STRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; STRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; STRIDED-NEXT: br label [[EXIT:%.*]] ; STRIDED: scalar.ph: -; STRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; STRIDED-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY]] ], [ [[P]], [[VECTOR_MEMCHECK]] ] -; STRIDED-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[P2]], [[ENTRY]] ], [ [[P2]], [[VECTOR_MEMCHECK]] ] +; STRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; STRIDED-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[P]], [[ENTRY]] ], [ [[P]], [[VECTOR_MEMCHECK]] ] +; STRIDED-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[P2]], [[ENTRY]] ], [ [[P2]], [[VECTOR_MEMCHECK]] ] ; STRIDED-NEXT: br label [[LOOP:%.*]] ; STRIDED: loop: ; STRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -790,7 +795,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[PTR2_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 [[STRIDE]] ; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1 ; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] +; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] ; STRIDED: exit: ; STRIDED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index 034b76741bab6..68d55d01a7d2e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -8,15 +8,14 @@ define void @test_pr98413_zext_removed(ptr %src, ptr noalias %dst, i64 %x) { ; CHECK-LABEL: define void @test_pr98413_zext_removed( ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 97, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 97, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 
97, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 97, [[TMP9]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[X]], i64 0 @@ -24,24 +23,27 @@ define void @test_pr98413_zext_removed(ptr %src, ptr noalias %dst, i64 %x) { ; CHECK-NEXT: [[TMP6:%.*]] = trunc <vscale x 8 x i64> [[BROADCAST_SPLAT]] to <vscale x 8 x i8> ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 97, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 8 [[TMP8]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP13]]) ; CHECK-NEXT: [[TMP10:%.*]] = trunc <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x i8> ; CHECK-NEXT: [[TMP11:%.*]] = and <vscale x 8 x i8> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP7]] -; CHECK-NEXT: store <vscale x 8 x i8> [[TMP11]], ptr [[TMP12]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP7]], [[TMP5]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP11]], ptr align 1 [[TMP12]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP13]]) +; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[TMP7]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 97 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 97, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP_SRC]], align 8 ; CHECK-NEXT: [[EXT_L:%.*]] = zext i16 [[L]] to i64 @@ -51,7 +53,7 @@ define void @test_pr98413_zext_removed(ptr %src, ptr noalias %dst, i64 %x) { ; CHECK-NEXT: store i8 [[TRUNC_AND]], ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 96 -; CHECK-NEXT: br i1
[[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -79,15 +81,14 @@ define void @test_pr98413_sext_removed(ptr %src, ptr noalias %dst, i64 %x) { ; CHECK-LABEL: define void @test_pr98413_sext_removed( ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 97, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 97, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 97, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 97, [[TMP9]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[X]], i64 0 @@ -95,24 +96,27 @@ define void @test_pr98413_sext_removed(ptr %src, ptr noalias %dst, i64 %x) { ; CHECK-NEXT: [[TMP6:%.*]] = trunc <vscale x 8 x i64> [[BROADCAST_SPLAT]] to <vscale x 8 x i8> ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 97, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 8 [[TMP8]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP13]]) ; CHECK-NEXT: [[TMP10:%.*]] = trunc <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x i8> ; CHECK-NEXT: [[TMP11:%.*]] = and <vscale x 8 x i8> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP7]] -; CHECK-NEXT: store <vscale x 8 x i8> [[TMP11]], ptr [[TMP12]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP7]], [[TMP5]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP11]], ptr align 1 [[TMP12]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP13]]) +; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[TMP7]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 97 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 97, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi
i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP_SRC]], align 8 ; CHECK-NEXT: [[EXT_L:%.*]] = sext i16 [[L]] to i64 @@ -122,7 +126,7 @@ define void @test_pr98413_sext_removed(ptr %src, ptr noalias %dst, i64 %x) { ; CHECK-NEXT: store i8 [[TRUNC_AND]], ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 96 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -172,7 +176,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP6]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 9 -; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: @@ -259,15 +263,14 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[T:%.*]] = trunc i64 [[N]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[V]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP13:%.*]] = sub i64 [[TMP4]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP13]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[N]], i64 0 @@ -284,20 +287,23 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT4]], i32 8, <vscale x 2 x i1> [[TMP8]], <vscale x 2 x double> poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[WIDE_MASKED_GATHER]],
<vscale x 2 x ptr> [[BROADCAST_SPLAT6]], i32 8, <vscale x 2 x i1> [[TMP8]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT4]], <vscale x 2 x i1> [[TMP8]], i32 [[TMP14]]) +; CHECK-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[WIDE_MASKED_GATHER]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT6]], <vscale x 2 x i1> [[TMP8]], i32 [[TMP14]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[T1:%.*]] = trunc i64 [[N]] to i32 ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[T1]], [[T]] ; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] @@ -344,15 +350,15 @@ attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } attributes #1 = { "target-features"="+64bit,+v" } ;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META7:![0-9]+]], [[META2]]} -; CHECK: [[META7]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META1]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META3]], [[META1]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]], [[META3]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META3]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index d97e93d892a06..722fe5ed92ddd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -10,42 +10,44 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6 ; SCALABLE-LABEL: define void @uniform_load( ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; SCALABLE-NEXT: [[ENTRY:.*]]: -; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SCALABLE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; SCALABLE: [[VECTOR_PH]]: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP6]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; SCALABLE: [[VECTOR_BODY]]: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP10:%.*]] = call 
i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; SCALABLE-NEXT: [[TMP7:%.*]] = load i64, ptr [[B]], align 8 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP7]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]) +; SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 +; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025 +; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[SCALAR_PH]]: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; SCALABLE-NEXT: br label %[[FOR_BODY:.*]] ; SCALABLE: [[FOR_BODY]]: -; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8 ; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: [[FOR_END]]: ; SCALABLE-NEXT: ret void ; @@ -168,7 +170,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] @@ -182,7 +184,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -;
SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: [[FOR_END]]: ; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; SCALABLE-NEXT: ret i64 [[V_LCSSA]] @@ -286,15 +288,14 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-LABEL: define void @conditional_uniform_load( ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SCALABLE-NEXT: [[ENTRY:.*]]: -; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SCALABLE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; SCALABLE: [[VECTOR_PH]]: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP14:%.*]] = sub i64 [[TMP3]], 1 +; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP14]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B]], i64 0 @@ -302,30 +303,39 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) ; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]] -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 -; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; SCALABLE: [[VECTOR_BODY]]: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP17]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP8:%.*]] = zext i32 [[TMP17]] to i64 +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP18:%.*]] = call @llvm.stepvector.nxv4i32() +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ult [[TMP18]], 
[[BROADCAST_SPLAT4]]
 ; SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 4 x i64> [[VEC_IND]], splat (i64 10)
-; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i64> poison)
-; SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
+; SCALABLE-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
+; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.vp.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP17]])
+; SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
 ; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: store <vscale x 4 x i64> [[PREDPHI]], ptr [[TMP12]], align 8
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr align 8 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP17]])
+; SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP17]] to i64
+; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]]
+; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]]
 ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
+; SCALABLE-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
-; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; SCALABLE: [[SCALAR_PH]]:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
 ; SCALABLE-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
 ; SCALABLE-NEXT: br i1 [[CMP]], label %[[DO_LOAD:.*]], label %[[LATCH]]
 ; SCALABLE: [[DO_LOAD]]:
@@ -337,7 +347,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; SCALABLE-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -482,42 +492,44 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; SCALABLE-LABEL: define void @uniform_load_unaligned(
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT: [[ENTRY:.*]]:
-; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SCALABLE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE: [[VECTOR_PH]]:
 ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
-; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP7:%.*]] = sub i64 [[TMP3]], 1
+; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP7]]
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
 ; SCALABLE: [[VECTOR_BODY]]:
-; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B]], align 1
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]])
+; SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
+; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
+; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
-; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; SCALABLE: [[SCALAR_PH]]:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
 ; SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 1
 ; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -619,42 +631,44 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; SCALABLE-LABEL: define void @uniform_store(
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT: [[ENTRY:.*]]:
-; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SCALABLE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE: [[VECTOR_PH]]:
 ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
-; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1
+; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP6]]
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
 ; SCALABLE: [[VECTOR_BODY]]:
-; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8
 ; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; SCALABLE-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
+; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
+; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
+; SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
-; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; SCALABLE: [[SCALAR_PH]]:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
 ; SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8
 ; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -756,54 +770,55 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; SCALABLE-LABEL: define void @uniform_store_of_loop_varying(
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT: [[ENTRY:.*]]:
-; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SCALABLE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE: [[VECTOR_PH]]:
 ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
-; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1
+; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP9]]
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
+; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
+; SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; SCALABLE-NEXT: [[TMP13:%.*]] = mul <vscale x 2 x i64> [[TMP6]], splat (i64 1)
+; SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP13]]
 ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
 ; SCALABLE: [[VECTOR_BODY]]:
-; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; SCALABLE-NEXT: [[TMP10:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; SCALABLE-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+; SCALABLE-NEXT: [[INDEX:%.*]] = mul i64 1, [[TMP8]]
 ; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i64 0
 ; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
-; SCALABLE-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 1)
-; SCALABLE-NEXT: [[TMP9:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP8]]
-; SCALABLE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; SCALABLE-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
-; SCALABLE-NEXT: [[TMP13:%.*]] = mul nuw i32 [[TMP12]], 2
-; SCALABLE-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1
-; SCALABLE-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i64> [[TMP9]], i32 [[TMP14]]
-; SCALABLE-NEXT: store i64 [[TMP15]], ptr [[B]], align 8
+; SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT1]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
 ; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP16]], align 8
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP16]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
+; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[TMP10]]
+; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
+; SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
-; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; SCALABLE: [[SCALAR_PH]]:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
 ; SCALABLE-NEXT: store i64 [[IV]], ptr [[B]], align 8
 ; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -920,15 +935,14 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; SCALABLE-LABEL: define void @conditional_uniform_store(
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT: [[ENTRY:.*]]:
-; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SCALABLE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE: [[VECTOR_PH]]:
 ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
-; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP11:%.*]] = sub i64 [[TMP3]], 1
+; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP11]]
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
@@ -938,29 +952,33 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; SCALABLE-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; SCALABLE-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP6]], splat (i64 1)
 ; SCALABLE-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
 ; SCALABLE: [[VECTOR_BODY]]:
-; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; SCALABLE-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP14]]
+; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
+; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], splat (i64 10)
-; SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> [[BROADCAST_SPLAT2]], i32 8, <vscale x 2 x i1> [[TMP10]])
+; SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT2]], <vscale x 2 x i1> [[TMP10]], i32 [[TMP7]])
 ; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr [[TMP12]], align 8
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP7]] to i64
+; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]]
+; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]]
 ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
+; SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
-; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; SCALABLE: [[SCALAR_PH]]:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
 ; SCALABLE-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
 ; SCALABLE-NEXT: br i1 [[CMP]], label %[[DO_STORE:.*]], label %[[LATCH]]
 ; SCALABLE: [[DO_STORE]]:
@@ -971,7 +989,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -1109,42 +1127,44 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; SCALABLE-LABEL: define void @uniform_store_unaligned(
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT: [[ENTRY:.*]]:
-; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; SCALABLE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE: [[VECTOR_PH]]:
 ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
-; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP6:%.*]] = sub i64 [[TMP3]], 1
+; SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP6]]
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
 ; SCALABLE: [[VECTOR_BODY]]:
-; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 1
 ; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; SCALABLE-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
+; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
+; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
+; SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
-; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; SCALABLE: [[SCALAR_PH]]:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
 ; SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 1
 ; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
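The updated SCALABLE checks above all follow one EVL tail-folding shape: an AVL ("application vector length") counter starts at the trip count, each iteration asks @llvm.experimental.get.vector.length how many lanes to process, and the memory accesses become llvm.vp.* intrinsics bounded by that length, so no minimum-iteration guard or scalar epilogue is needed. As a standalone illustration (a hand-written sketch, not part of the patch; the function and value names are invented), the pattern for a simple uniform store looks like:

; Sketch: EVL tail-folded loop storing a splat of %v to %a[0..%n).
declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg)
declare void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>, i32)

define void @evl_store_sketch(ptr %a, i64 %v, i64 %n) {
entry:
  %splatinsert = insertelement <vscale x 2 x i64> poison, i64 %v, i64 0
  %splat = shufflevector <vscale x 2 x i64> %splatinsert, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %avl = phi i64 [ %n, %entry ], [ %avl.next, %vector.body ]
  ; EVL = min(AVL, VLMAX); the final iteration simply gets a shorter vector.
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 2, i1 true)
  %gep = getelementptr inbounds i64, ptr %a, i64 %index
  ; Only the first %evl lanes are stored; the mask itself is all-true.
  call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> %splat, ptr align 8 %gep, <vscale x 2 x i1> splat (i1 true), i32 %evl)
  %evl.zext = zext i32 %evl to i64
  %index.next = add nuw i64 %index, %evl.zext
  %avl.next = sub nuw i64 %avl, %evl.zext
  %done = icmp eq i64 %index.next, %n
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}

On RISC-V this maps naturally onto vsetvli: get.vector.length becomes the vsetvli that clamps VL for the iteration, and the vp.store lowers to a VL-limited unit-stride store.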
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
index bda983983b70a..31501f6c1f104 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
@@ -17,17 +17,28 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
 ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP9]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP10]], 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> splat (i1 true)), !alias.scope [[META3]], !noalias [[META0]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 100, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT3]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP7]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
@@ -41,7 +52,7 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
 ; CHECK-NEXT: store i32 [[DOTPRE]], ptr [[DST]], align 4
 ; CHECK-NEXT: [[TMP3]] = add nuw i64 [[TMP2]], 1
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[TMP3]], 100
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -65,8 +76,9 @@ exit:
 ; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
 ; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
 ; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]], [[META8:![0-9]+]]}
 ; CHECK: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]]}
+; CHECK: [[META7]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META6]]}
 ;.
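The metadata checks above also pin down how the vectorizer now tags EVL-folded loops: llvm.loop.isvectorized.tailfoldingstyle with !"evl" appears alongside llvm.loop.isvectorized. A hand-written sketch (not part of the patch; invented names) combining a masked EVL scatter of the shape matched above with that loop metadata:

; Sketch: EVL tail-folded scatter through a splatted pointer, tagged with
; the same loop-metadata shape the CHECK lines above match.
declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg)
declare void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32>, <vscale x 4 x ptr>, <vscale x 4 x i1>, i32)

define void @evl_scatter_sketch(ptr %dst, i32 %v, i64 %n) {
entry:
  %p.ins = insertelement <vscale x 4 x ptr> poison, ptr %dst, i64 0
  %p.splat = shufflevector <vscale x 4 x ptr> %p.ins, <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
  %v.ins = insertelement <vscale x 4 x i32> poison, i32 %v, i64 0
  %v.splat = shufflevector <vscale x 4 x i32> %v.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %avl = phi i64 [ %n, %entry ], [ %avl.next, %vector.body ]
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 4, i1 true)
  ; Every active lane writes the same address; lanes past %evl are inactive.
  call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %v.splat, <vscale x 4 x ptr> align 4 %p.splat, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %evl.zext = zext i32 %evl to i64
  %index.next = add nuw i64 %index, %evl.zext
  %avl.next = sub nuw i64 %avl, %evl.zext
  %done = icmp eq i64 %index.next, %n
  br i1 %done, label %exit, label %vector.body, !llvm.loop !0

exit:
  ret void
}

!0 = distinct !{!0, !1, !2, !3}
!1 = !{!"llvm.loop.isvectorized", i32 1}
!2 = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
!3 = !{!"llvm.loop.unroll.runtime.disable"}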
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll
index d7c9ce4216c1e..46695221c27db 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-riscv-vector-reverse.ll
@@ -10,8 +10,7 @@
 ; RUN: -disable-output < %s 2>&1 | FileCheck %s

 define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) {
-; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
 ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
 ; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<[[OTC:%.+]]> = original trip-count
@@ -21,41 +20,42 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: vp<[[RESUME_IV_A:%.+]]> = DERIVED-IV ir<%n> + vp<[[VTC]]> * ir<-1>
-; CHECK-NEXT: vp<[[RESUME_IV_B:%.+]]> = DERIVED-IV ir<%n> + vp<[[VTC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT: EMIT vp<[[INDUCTION:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEX_NEXT:%.+]]>
-; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[INDUCTION]]> * ir<-1>
-; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>, vp<[[VF]]>
+; CHECK-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%.+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ vp<[[OTC]]>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ]
+; CHECK-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[EVL_PHI]]> * ir<-1>
+; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1>, vp<[[EVL]]>
 ; CHECK-NEXT: CLONE ir<[[IDX:%.+]]> = add nsw vp<[[SCALAR_STEPS]]>, ir<-1>
 ; CHECK-NEXT: CLONE ir<[[IDX_PROM:%.+]]> = zext ir<[[IDX]]>
 ; CHECK-NEXT: CLONE ir<[[ARRAY_IDX_B:%.+]]> = getelementptr inbounds ir<[[B:%.+]]>, ir<[[IDX_PROM]]>
-; CHECK-NEXT: vp<[[VEC_END_PTR_B:%.+]]> = vector-end-pointer inbounds ir<[[ARRAY_IDX_B]]>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN ir<[[VAL_B:%.+]]> = load vp<[[VEC_END_PTR_B]]>
+; CHECK-NEXT: vp<[[VEC_END_PTR_B:%.+]]> = vector-end-pointer ir<[[ARRAY_IDX_B]]>, vp<[[EVL]]>
+; CHECK-NEXT: WIDEN ir<[[VAL_B:%.+]]> = vp.load vp<[[VEC_END_PTR_B]]>, vp<[[EVL]]>
 ; CHECK-NEXT: WIDEN ir<[[ADD_RESULT:%.+]]> = add ir<[[VAL_B]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<[[ARRAY_IDX_A:%.+]]> = getelementptr inbounds ir<[[A:%.+]]>, ir<[[IDX_PROM]]>
-; CHECK-NEXT: vp<[[VEC_END_PTR_A:%.+]]> = vector-end-pointer inbounds ir<[[ARRAY_IDX_A]]>, vp<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_END_PTR_A]]>, ir<[[ADD_RESULT]]>
-; CHECK-NEXT: EMIT vp<[[INDEX_NEXT]]> = add nuw vp<[[INDUCTION]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: vp<[[VEC_END_PTR_A:%.+]]> = vector-end-pointer ir<[[ARRAY_IDX_A]]>, vp<[[EVL]]>
+; CHECK-NEXT: WIDEN vp.store vp<[[VEC_END_PTR_A]]>, ir<[[ADD_RESULT]]>, vp<[[EVL]]>
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[EVL]]>, vp<[[EVL_PHI]]>
+; CHECK-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[EVL]]>
+; CHECK-NEXT: EMIT vp<[[INDEX_NEXT]]> = add vp<[[INDUCTION]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[INDEX_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[OTC]]>, vp<[[VTC]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
+; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[RESUME_IV_A]]>, middle.block ], [ ir<%n>, ir-bb ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<[[RESUME_IV_B]]>, middle.block ], [ ir<%n>, ir-bb ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ ir<%n>, ir-bb ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ ir<%n>, ir-bb ]
 ; CHECK-NEXT: Successor(s): ir-bb
 ;
 entry: