diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 12bf8c1b4de70..d62d99cf31899 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -116,8 +116,8 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
   }
   TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const override {
-    return ST->hasVInstructions() ? TailFoldingStyle::Data
-                                  : TailFoldingStyle::DataWithoutLaneMask;
+    return ST->hasVInstructions() ? TailFoldingStyle::DataWithEVL
+                                  : TailFoldingStyle::None;
   }
   std::optional<unsigned> getMaxVScale() const override;
   std::optional<unsigned> getVScaleForTuning() const override;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 6c57d2f2f00e6..c5144645269da 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -133,16 +133,23 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8:%.*]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i64 8, [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP9]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11:%.*]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
+; CHECK-NEXT:    call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr align 1 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
 ; CHECK:       scalar.ph:
@@ -150,16 +157,16 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[I_08]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[I_08]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -215,7 +222,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -272,7 +279,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -315,7 +322,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -332,7 +339,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index a1201dcfbdf57..02288112f3389 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -7,29 +7,49 @@ define void @test(ptr %p, i64 %a, i8 %b) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH1:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP0:%.*]] = shl <16 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr <16 x i64> [[TMP0]], splat (i64 52)
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 9, [[TMP2]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 2
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[B]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = shl <vscale x 2 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
+; CHECK-NEXT:    [[TMP6:%.*]] = ashr <vscale x 2 x i64> [[TMP5]], splat (i64 52)
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i64> [[TMP6]] to <vscale x 2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <vscale x 2 x i8> [[BROADCAST_SPLAT]] to <vscale x 2 x i32>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 2 x i32> [[TMP9]], splat (i32 1)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i32> zeroinitializer, [[TMP10]]
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 9)
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sge <16 x i32> [[VEC_IND]], splat (i32 2)
-; CHECK-NEXT:    [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i1> [[TMP4]], <16 x i1> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl <16 x i32> [[PREDPHI]], splat (i32 8)
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i8>
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <16 x i8> [[TMP8]], i32 15
-; CHECK-NEXT:    store i8 [[TMP40]], ptr [[P]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16)
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_COND]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i32 9, [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 1, [[TMP11]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP12]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ule <vscale x 2 x i32> [[VEC_IND]], splat (i32 8)
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp sge <vscale x 2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT:    [[TMP15:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i32> [[TMP7]], <vscale x 2 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shl <vscale x 2 x i32> [[PREDPHI]], splat (i32 8)
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc <vscale x 2 x i32> [[TMP16]] to <vscale x 2 x i8>
+; CHECK-NEXT:    call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> [[TMP17]], <vscale x 2 x ptr> align 1 [[BROADCAST_SPLAT4]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP11]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[EXIT1:%.*]]
 ; CHECK:       scalar.ph:
@@ -52,7 +72,7 @@ define void @test(ptr %p, i64 %a, i8 %b) {
 ; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SHL_I32]] to i8
 ; CHECK-NEXT:    store i8 [[TRUNC]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 8
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND1]], label [[EXIT1]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND1]], label [[EXIT1]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -84,8 +104,9 @@ exit: ; preds = %for.body
   ret void
 }
 ;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index ff9c58525e51c..b046f61e4d50e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -24,12 +24,16 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -46,7 +50,7 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -87,15 +91,19 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
 ; CHECK:       scalar.ph:
@@ -109,7 +117,7 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT:    store i64 [[V]], ptr [[AADDR]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -146,20 +154,24 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
-; CHECK-NEXT:    [[TMP11]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT:    [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    [[TMP11]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[VEC_PHI]], i32 [[TMP7]])
+; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP12]])
+; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP11]])
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
@@ -175,7 +187,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
@@ -217,13 +229,17 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
 ; CHECK:       scalar.ph:
@@ -235,7 +251,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -272,14 +288,18 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; CHECK-NEXT:    store i64 [[V]], ptr [[B:%.*]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
 ; CHECK:       scalar.ph:
@@ -292,7 +312,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -363,15 +383,19 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1024)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
 ; CHECK:       scalar.ph:
@@ -385,7 +409,7 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index 528cec077d8a9..b56e7128af4c2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -170,15 +170,11 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP13]]
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT4]], [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP15]], i32 9)
-; CHECK-NEXT:    [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> [[BROADCAST_SPLAT2]], i32 1, <vscale x 4 x i1> [[TMP11]])
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = sub i32 9, [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    call void @llvm.vp.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> align 1 [[BROADCAST_SPLAT2]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP6]])
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP6]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -199,7 +195,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[ADD]] = add i8 [[F_039]], 1
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[F_039]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CONV]], 8
-; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -298,7 +294,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[WIDE_MASKED_GATHER]], <vscale x 2 x ptr> [[BROADCAST_SPLAT6]], i32 8, <vscale x 2 x i1> [[TMP8]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -319,7 +315,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK:       [[LOOP_LATCH]]:
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[V]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -359,8 +355,9 @@ attributes #1 = { "target-features"="+64bit,+v" }
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META7:![0-9]+]], [[META2]]}
+; CHECK: [[META7]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 8baf9d90c795a..c6955f11d8520 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -2,9 +2,6 @@
 ; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=0 -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=SCALABLE
 ; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=off -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=FIXEDLEN
 ; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=0 -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=TF-SCALABLE
-; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=TF-FIXEDLEN
-
-
 target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
 target triple = "riscv64"
 
@@ -103,15 +100,19 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TF-SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TF-SCALABLE:       [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
+; TF-SCALABLE-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = load i64, ptr [[B]], align 8
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; TF-SCALABLE-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP6]] to i64
+; TF-SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
+; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
 ; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; TF-SCALABLE:       [[MIDDLE_BLOCK]]:
@@ -126,44 +127,10 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; TF-SCALABLE:       [[FOR_END]]:
 ; TF-SCALABLE-NEXT:    ret void
 ;
-; TF-FIXEDLEN-LABEL: define void @uniform_load(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; TF-FIXEDLEN-NEXT:  [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN:       [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT:    br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN:       [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT:    [[TMP0:%.*]] = load i64, ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0
-; TF-FIXEDLEN-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-FIXEDLEN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-FIXEDLEN-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; TF-FIXEDLEN:       [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT:    br label %[[FOR_END:.*]]
-; TF-FIXEDLEN:       [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
-; TF-FIXEDLEN-NEXT:    br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN:       [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT:    [[V:%.*]] = load i64, ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; TF-FIXEDLEN:       [[FOR_END]]:
-; TF-FIXEDLEN-NEXT:    ret void
-;
 entry:
   br label %for.body
 
@@ -277,22 +244,6 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
 ; TF-SCALABLE-NEXT:    [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ]
 ; TF-SCALABLE-NEXT:    ret i64 [[V_LCSSA]]
 ;
-; TF-FIXEDLEN-LABEL: define i64 @uniform_load_outside_use(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT:  [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT:    br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN:       [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT:    [[V:%.*]] = load i64, ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]]
-; TF-FIXEDLEN:       [[FOR_END]]:
-; TF-FIXEDLEN-NEXT:    [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT:    ret i64 [[V_LCSSA]]
-;
 entry:
   br label %for.body
 
@@ -437,25 +388,31 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], splat (i64 1)
 ; TF-SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
-; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP4]]
-; TF-SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
-; TF-SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TF-SCALABLE:       [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; TF-SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 1025)
+; TF-SCALABLE-NEXT:    [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; TF-SCALABLE-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
+; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP11]]
+; TF-SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
+; TF-SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], splat (i64 1024)
 ; TF-SCALABLE-NEXT:    [[TMP10:%.*]] = icmp ugt <vscale x 4 x i64> [[VEC_IND]], splat (i64 10)
 ; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
-; TF-SCALABLE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> poison)
+; TF-SCALABLE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.vp.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP7]])
 ; TF-SCALABLE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TF-SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr [[TMP13]], i32 8, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT:    call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr align 8 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; TF-SCALABLE-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP7]] to i64
+; TF-SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]]
+; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
 ; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; TF-SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; TF-SCALABLE:       [[MIDDLE_BLOCK]]:
 ; TF-SCALABLE-NEXT:    br label %[[FOR_END:.*]]
 ; TF-SCALABLE:       [[SCALAR_PH]]:
@@ -474,55 +431,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT:    store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
 ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; TF-SCALABLE:       [[FOR_END]]:
 ; TF-SCALABLE-NEXT:    ret void
 ;
-; TF-FIXEDLEN-LABEL: define void @conditional_uniform_load(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT:  [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN:       [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0
-; TF-FIXEDLEN-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT:    br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN:       [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], splat (i64 10)
-; TF-FIXEDLEN-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
-; TF-FIXEDLEN-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison)
-; TF-FIXEDLEN-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer
-; TF-FIXEDLEN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-FIXEDLEN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP4]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-FIXEDLEN-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; TF-FIXEDLEN-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; TF-FIXEDLEN:       [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT:    br label %[[FOR_END:.*]]
-; TF-FIXEDLEN:       [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
-; TF-FIXEDLEN-NEXT:    br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN:       [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; TF-FIXEDLEN-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
-; TF-FIXEDLEN-NEXT:    br i1 [[CMP]], label %[[DO_LOAD:.*]], label %[[LATCH]]
-; TF-FIXEDLEN:       [[DO_LOAD]]:
-; TF-FIXEDLEN-NEXT:    [[V:%.*]] = load i64, ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT:    br label %[[LATCH]]
-; TF-FIXEDLEN:       [[LATCH]]:
-; TF-FIXEDLEN-NEXT:    [[PHI:%.*]] = phi i64 [ 0, %[[FOR_BODY]] ], [ [[V]], %[[DO_LOAD]] ]
-; TF-FIXEDLEN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT:    store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; TF-FIXEDLEN:       [[FOR_END]]:
-; TF-FIXEDLEN-NEXT:    ret void
-;
 entry:
   br label %for.body
 
@@ -640,17 +552,21 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TF-SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TF-SCALABLE:       [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
+; TF-SCALABLE-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = load i64, ptr [[B]], align 1
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; TF-SCALABLE-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP6]] to i64
+; TF-SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
+; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
 ; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; TF-SCALABLE:       [[MIDDLE_BLOCK]]:
 ; TF-SCALABLE-NEXT:    br label %[[FOR_END:.*]]
 ; TF-SCALABLE:       [[SCALAR_PH]]:
@@ -663,44 +579,10 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; TF-SCALABLE:       [[FOR_END]]:
 ; TF-SCALABLE-NEXT:    ret void
 ;
-; TF-FIXEDLEN-LABEL: define void @uniform_load_unaligned(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT:  [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN:       [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT:    br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN:       [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT:    [[TMP0:%.*]] = load i64, ptr [[B]], align 1
-; TF-FIXEDLEN-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0
-; TF-FIXEDLEN-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-FIXEDLEN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-FIXEDLEN-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; TF-FIXEDLEN:       [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT:    br label %[[FOR_END:.*]]
-; TF-FIXEDLEN:       [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
-; TF-FIXEDLEN-NEXT:    br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN:       [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT:    [[V:%.*]] = load i64, ptr [[B]], align 1
-; TF-FIXEDLEN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; TF-FIXEDLEN:       [[FOR_END]]:
-; TF-FIXEDLEN-NEXT:    ret void
-;
 entry:
   br label %for.body
 
@@ -813,15 +695,19 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TF-SCALABLE:       [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
+; TF-SCALABLE-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[B]], align 8
 ; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP5]] to i64
+; TF-SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[INDEX]]
+; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
 ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; TF-SCALABLE:       [[MIDDLE_BLOCK]]:
 ; TF-SCALABLE-NEXT:    br label %[[FOR_END:.*]]
 ; TF-SCALABLE:       [[SCALAR_PH]]:
@@ -834,44 +720,10 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; TF-SCALABLE:       [[FOR_END]]:
 ; TF-SCALABLE-NEXT:    ret void
 ;
-; TF-FIXEDLEN-LABEL: define void @uniform_store(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT:  [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN:       [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0
-; TF-FIXEDLEN-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT:    br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN:       [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-FIXEDLEN-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT:    store i64 [[V]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-FIXEDLEN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
-; TF-FIXEDLEN-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; TF-FIXEDLEN-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
-; TF-FIXEDLEN-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; TF-FIXEDLEN:       [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT:    br label %[[FOR_END:.*]]
-; TF-FIXEDLEN:       [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
-; TF-FIXEDLEN-NEXT:    br label %[[FOR_BODY:.*]]
-; TF-FIXEDLEN:       [[FOR_BODY]]:
-; TF-FIXEDLEN-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-FIXEDLEN-NEXT:    store i64 [[V]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-FIXEDLEN-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-FIXEDLEN-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-FIXEDLEN-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-FIXEDLEN-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; TF-FIXEDLEN:       [[FOR_END]]:
-; TF-FIXEDLEN-NEXT:    ret void
-;
 entry:
   br label %for.body
 
@@ -1003,22 +855,27 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 1)
 ; TF-SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
-; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP4]]
-; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
-; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TF-SCALABLE:       [[VECTOR_BODY]]:
-; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; TF-SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; TF-SCALABLE-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025)
-; TF-SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT:    [[AVL:%.*]] = sub i64 1025, [[INDEX]]
+; TF-SCALABLE-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; TF-SCALABLE-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP9]] to i64
+; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP13]]
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
+; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
 ; TF-SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TF-SCALABLE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT3]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TF-SCALABLE-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT3]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; TF-SCALABLE-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP9]] to i64
+; TF-SCALABLE-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[INDEX]]
+; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
 ; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; TF-SCALABLE:       [[MIDDLE_BLOCK]]:
 ; TF-SCALABLE-NEXT:    br label %[[FOR_END:.*]]
 ; TF-SCALABLE:       [[SCALAR_PH]]:
@@ -1031,71 +888,10 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; TF-SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; TF-SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; TF-SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; TF-SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; TF-SCALABLE:       [[FOR_END]]:
 ; TF-SCALABLE-NEXT:    ret void
 ;
-; TF-FIXEDLEN-LABEL: define void @uniform_store_of_loop_varying(
-; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; TF-FIXEDLEN-NEXT:  [[ENTRY:.*]]:
-; TF-FIXEDLEN-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; TF-FIXEDLEN:       [[VECTOR_PH]]:
-; TF-FIXEDLEN-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0
-; TF-FIXEDLEN-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; TF-FIXEDLEN-NEXT:    br label %[[VECTOR_BODY:.*]]
-; TF-FIXEDLEN:       [[VECTOR_BODY]]:
-; TF-FIXEDLEN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; TF-FIXEDLEN-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025)
-; TF-FIXEDLEN-NEXT:    [[TMP0:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; TF-FIXEDLEN-NEXT:    br i1 [[TMP0]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; TF-FIXEDLEN:       [[PRED_STORE_IF]]:
-; TF-FIXEDLEN-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; TF-FIXEDLEN-NEXT:    store i64 [[TMP1]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT:    br label %[[PRED_STORE_CONTINUE]]
-; TF-FIXEDLEN:       [[PRED_STORE_CONTINUE]]:
-; TF-FIXEDLEN-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; TF-FIXEDLEN-NEXT:    br i1 [[TMP2]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
-; TF-FIXEDLEN:       [[PRED_STORE_IF1]]:
-; TF-FIXEDLEN-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 1
-; TF-FIXEDLEN-NEXT:    store i64 [[TMP3]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
-; TF-FIXEDLEN:       [[PRED_STORE_CONTINUE2]]:
-; TF-FIXEDLEN-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; TF-FIXEDLEN-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
-; TF-FIXEDLEN:       [[PRED_STORE_IF3]]:
-; TF-FIXEDLEN-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 2
-; TF-FIXEDLEN-NEXT:    store i64 [[TMP5]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
-; TF-FIXEDLEN:       [[PRED_STORE_CONTINUE4]]:
-; TF-FIXEDLEN-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; TF-FIXEDLEN-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
-; TF-FIXEDLEN:       [[PRED_STORE_IF5]]:
-; TF-FIXEDLEN-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 3
-; TF-FIXEDLEN-NEXT:    store i64 [[TMP7]], ptr [[B]], align 8
-; TF-FIXEDLEN-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
-; TF-FIXEDLEN:       [[PRED_STORE_CONTINUE6]]:
-; TF-FIXEDLEN-NEXT:    [[TMP8:%.*]] = getelementptr
inbounds i64, ptr [[A]], i64 [[INDEX]] -; TF-FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]]) -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; TF-FIXEDLEN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028 -; TF-FIXEDLEN-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; TF-FIXEDLEN: [[MIDDLE_BLOCK]]: -; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]] -; TF-FIXEDLEN: [[SCALAR_PH]]: -; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]] -; TF-FIXEDLEN: [[FOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; TF-FIXEDLEN-NEXT: store i64 [[IV]], ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] -; TF-FIXEDLEN: [[FOR_END]]: -; TF-FIXEDLEN-NEXT: ret void -; entry: br label %for.body @@ -1240,24 +1036,28 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP5]], splat (i64 1) ; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]] -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) +; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]] +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP11]] +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], splat (i64 10) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer -; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT1]], [[BROADCAST_SPLAT2]], i32 8, [[TMP9]]) +; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT1]], align 8 
[[BROADCAST_SPLAT2]], [[TMP10]], i32 [[TMP9]]) ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT1]], ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[BROADCAST_SPLAT1]], ptr align 8 [[TMP13]], splat (i1 true), i32 [[TMP9]]) +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]] +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; TF-SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[SCALAR_PH]]: @@ -1275,55 +1075,10 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; TF-SCALABLE: [[FOR_END]]: ; TF-SCALABLE-NEXT: ret void ; -; TF-FIXEDLEN-LABEL: define void @conditional_uniform_store( -; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]: -; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; TF-FIXEDLEN: [[VECTOR_PH]]: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT1]], <4 x ptr> poison, <4 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]] -; TF-FIXEDLEN: [[VECTOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025) -; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], splat (i64 10) -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP0]], <4 x i1> zeroinitializer -; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP1]]) -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], 
i64 [[INDEX]] -; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]]) -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) -; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028 -; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; TF-FIXEDLEN: [[MIDDLE_BLOCK]]: -; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]] -; TF-FIXEDLEN: [[SCALAR_PH]]: -; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]] -; TF-FIXEDLEN: [[FOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] -; TF-FIXEDLEN-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10 -; TF-FIXEDLEN-NEXT: br i1 [[CMP]], label %[[DO_STORE:.*]], label %[[LATCH]] -; TF-FIXEDLEN: [[DO_STORE]]: -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8 -; TF-FIXEDLEN-NEXT: br label %[[LATCH]] -; TF-FIXEDLEN: [[LATCH]]: -; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] -; TF-FIXEDLEN: [[FOR_END]]: -; TF-FIXEDLEN-NEXT: ret void -; entry: br label %for.body @@ -1442,15 +1197,19 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 1025) +; TF-SCALABLE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 1 ; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP5]]) +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = zext i32 [[TMP5]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[INDEX]] +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[SCALAR_PH]]: @@ -1463,44 +1222,10 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; TF-SCALABLE: [[FOR_END]]: ; TF-SCALABLE-NEXT: ret void ; -; TF-FIXEDLEN-LABEL: define void @uniform_store_unaligned( -; TF-FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-FIXEDLEN-NEXT: [[ENTRY:.*]]: -; TF-FIXEDLEN-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; TF-FIXEDLEN: [[VECTOR_PH]]: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: br label %[[VECTOR_BODY:.*]] -; TF-FIXEDLEN: [[VECTOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 1025) -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 1 -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]]) -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028 -; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] -; TF-FIXEDLEN: [[MIDDLE_BLOCK]]: -; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]] -; TF-FIXEDLEN: [[SCALAR_PH]]: -; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]] -; TF-FIXEDLEN: [[FOR_BODY]]: -; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 1 -; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] -; TF-FIXEDLEN: [[FOR_END]]: -; TF-FIXEDLEN-NEXT: ret void -; entry: br label %for.body