Skip to content

Commit 27db46f

Browse files
committed
[VPlan] Replicate VPScalarIVStepsRecipe by VF outside replicate regions.
Extend replicateByVF to also handle VPScalarIVStepsRecipe. To do so, the patch adds a new lane operand to VPScalarIVStepsRecipe, which is only added when replicating. This enables removing a number of lane 0 computations. The lane operand will also be used to explicitly replicate replicate regions in a follow-up. Depends on #169796 (included in PR).
1 parent 55f2774 commit 27db46f

File tree

74 files changed

+2046
-2172
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

74 files changed

+2046
-2172
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3809,10 +3809,13 @@ class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
38093809
~VPScalarIVStepsRecipe() override = default;
38103810

38113811
VPScalarIVStepsRecipe *clone() override {
3812-
return new VPScalarIVStepsRecipe(
3812+
auto *NewR = new VPScalarIVStepsRecipe(
38133813
getOperand(0), getOperand(1), getOperand(2), InductionOpcode,
38143814
hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(),
38153815
getDebugLoc());
3816+
if (getNumOperands() == 4)
3817+
NewR->addOperand(getOperand(3));
3818+
return NewR;
38163819
}
38173820

38183821
VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC)

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2388,22 +2388,28 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
23882388
EndLane = StartLane + 1;
23892389
}
23902390
Value *StartIdx0;
2391-
if (getNumOperands() == 3) {
2391+
if (getNumOperands() == 3)
23922392
StartIdx0 = getSignedIntOrFpConstant(BaseIVTy, 0);
2393-
} else
2393+
else
23942394
StartIdx0 = State.get(getOperand(3), true);
23952395

23962396
for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2397-
Value *StartIdx = Builder.CreateBinOp(
2398-
AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
2397+
Value *StartIdx = StartIdx0;
2398+
if (Lane != 0) {
2399+
StartIdx = Builder.CreateBinOp(AddOp, StartIdx0,
2400+
getSignedIntOrFpConstant(BaseIVTy, Lane));
2401+
}
23992402
// The step returned by `createStepForVF` is a runtime-evaluated value
24002403
// when VF is scalable. Otherwise, it should be folded into a Constant.
24012404
assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
24022405
"Expected StartIdx to be folded to a constant when VF is not "
24032406
"scalable");
24042407
auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
24052408
auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2406-
State.set(this, Add, VPLane(Lane));
2409+
if (State.Lane)
2410+
State.set(this, Add, VPLane(Lane));
2411+
else
2412+
State.set(this, Add, VPLane(0));
24072413
}
24082414
}
24092415

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1457,7 +1457,7 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
14571457
}
14581458

14591459
// VPScalarIVSteps for part 0 can be replaced by their start value, if only
1460-
// the first lane is demanded.
1460+
// the first lane is demanded and both Lane and UnrollPart operands are 0.
14611461
if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
14621462
if ((Steps->getNumOperands() == 3 ||
14631463
match(Steps->getOperand(3), m_ZeroInt())) &&
@@ -4436,9 +4436,9 @@ void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
44364436
for (VPBasicBlock *VPBB :
44374437
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
44384438
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4439-
if (!isa<VPReplicateRecipe, VPInstruction>(&R))
4439+
if (!isa<VPScalarIVStepsRecipe, VPReplicateRecipe, VPInstruction>(&R))
44404440
continue;
4441-
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4441+
auto *DefR = cast<VPSingleDefRecipe>(&R);
44424442
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
44434443
VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
44444444
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -126,17 +126,18 @@ class UnrollState {
126126
};
127127
} // namespace
128128

129-
void UnrollState::addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
130-
unsigned Part) {
129+
static void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
130+
unsigned Part, VPlan &Plan,
131+
VPTypeAnalysis &TypeInfo) {
132+
Type *BaseIVTy = TypeInfo.inferScalarType(Steps->getOperand(0));
133+
Type *IntStepTy =
134+
IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
131135
if (Part == 0) {
132-
Steps->addOperand(getConstantInt(Part));
136+
Steps->addOperand(Plan.getConstantInt(IntStepTy, 0));
133137
return;
134138
}
135139

136140
VPBuilder Builder(Steps);
137-
Type *BaseIVTy = TypeInfo.inferScalarType(Steps->getOperand(0));
138-
Type *IntStepTy =
139-
IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
140141
VPValue *StartIdx0 = Steps->getOperand(2);
141142
StartIdx0 = Builder.createOverflowingOp(
142143
Instruction::Mul,
@@ -153,6 +154,11 @@ void UnrollState::addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
153154
Steps->addOperand(StartIdx0);
154155
}
155156

157+
void UnrollState::addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
158+
unsigned Part) {
159+
return ::addStartIndexForScalarSteps(Steps, Part, Plan, TypeInfo);
160+
}
161+
156162
void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
157163
VPBlockBase *InsertPt = VPR->getSingleSuccessor();
158164
for (unsigned Part = 1; Part != UF; ++Part) {
@@ -558,12 +564,28 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
558564
/*IsSingleScalar=*/true, /*Mask=*/nullptr,
559565
*RepR, *RepR, RepR->getDebugLoc());
560566
} else {
561-
assert(isa<VPInstruction>(DefR) &&
567+
assert((isa<VPInstruction, VPScalarIVStepsRecipe>(DefR)) &&
562568
"DefR must be a VPReplicateRecipe or VPInstruction");
563569
New = DefR->clone();
564570
for (const auto &[Idx, Op] : enumerate(NewOps)) {
565571
New->setOperand(Idx, Op);
566572
}
573+
if (isa<VPScalarIVStepsRecipe>(New)) {
574+
VPTypeAnalysis TypeInfo(Plan);
575+
if (New->getNumOperands() == 3)
576+
addStartIndexForScalarSteps(cast<VPScalarIVStepsRecipe>(New), 0, Plan,
577+
TypeInfo);
578+
579+
if (Lane.getKnownLane() != 0) {
580+
Type *BaseIVTy = TypeInfo.inferScalarType(DefR->getOperand(0));
581+
VPBuilder Builder(DefR);
582+
New->setOperand(
583+
3, Builder.createNaryOp(
584+
Instruction::Add,
585+
{New->getOperand(3),
586+
Plan.getConstantInt(BaseIVTy, Lane.getKnownLane())}));
587+
}
588+
}
567589
}
568590
New->insertBefore(DefR);
569591
return New;
@@ -590,14 +612,17 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
590612
SmallVector<VPRecipeBase *> ToRemove;
591613
for (VPBasicBlock *VPBB : VPBBsToUnroll) {
592614
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
593-
if (!isa<VPInstruction, VPReplicateRecipe>(&R) ||
615+
if (!isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(&R) ||
594616
(isa<VPReplicateRecipe>(&R) &&
595617
cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
596618
(isa<VPInstruction>(&R) &&
597619
!cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
598620
cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
599621
continue;
600622

623+
if (isa<VPScalarIVStepsRecipe>(&R) && Plan.hasScalarVFOnly())
624+
continue;
625+
601626
auto *DefR = cast<VPSingleDefRecipe>(&R);
602627
VPBuilder Builder(DefR);
603628
if (DefR->getNumUsers() == 0) {

llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,43 +16,42 @@ define void @low_trip_count_small(i32 %x, ptr %dst) {
1616
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
1717
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
1818
; CHECK: [[VECTOR_BODY]]:
19-
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 0
2019
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[DST]], i64 1
2120
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[DST]], i64 2
2221
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[DST]], i64 3
23-
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0
22+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[DST]], i32 0
2423
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x ptr> [[TMP1]], ptr [[NEXT_GEP2]], i32 1
2524
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> [[TMP2]], ptr [[NEXT_GEP3]], i32 2
2625
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> [[TMP3]], ptr [[NEXT_GEP4]], i32 3
2726
; CHECK-NEXT: [[TMP5:%.*]] = icmp ule <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[BROADCAST_SPLAT]]
2827
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
2928
; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
3029
; CHECK: [[PRED_STORE_IF]]:
31-
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 1
30+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 1
3231
; CHECK-NEXT: store i8 0, ptr [[TMP7]], align 1
3332
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
3433
; CHECK: [[PRED_STORE_CONTINUE]]:
3534
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
36-
; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
37-
; CHECK: [[PRED_STORE_IF5]]:
35+
; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]]
36+
; CHECK: [[PRED_STORE_IF4]]:
3837
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 1
3938
; CHECK-NEXT: store i8 0, ptr [[TMP9]], align 1
40-
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
41-
; CHECK: [[PRED_STORE_CONTINUE6]]:
39+
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE5]]
40+
; CHECK: [[PRED_STORE_CONTINUE5]]:
4241
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
43-
; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
44-
; CHECK: [[PRED_STORE_IF7]]:
42+
; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
43+
; CHECK: [[PRED_STORE_IF6]]:
4544
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 1
4645
; CHECK-NEXT: store i8 0, ptr [[TMP11]], align 1
47-
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
48-
; CHECK: [[PRED_STORE_CONTINUE8]]:
46+
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE7]]
47+
; CHECK: [[PRED_STORE_CONTINUE7]]:
4948
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
50-
; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
51-
; CHECK: [[PRED_STORE_IF9]]:
49+
; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
50+
; CHECK: [[PRED_STORE_IF8]]:
5251
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 1
5352
; CHECK-NEXT: store i8 0, ptr [[TMP13]], align 1
54-
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
55-
; CHECK: [[PRED_STORE_CONTINUE10]]:
53+
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE9]]
54+
; CHECK: [[PRED_STORE_CONTINUE9]]:
5655
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
5756
; CHECK: [[MIDDLE_BLOCK]]:
5857
; CHECK-NEXT: br label %[[EXIT:.*]]

llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,9 @@ define dso_local double @test(ptr nocapture noundef readonly %data, ptr nocaptur
2020
; CHECK: vector.body:
2121
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2222
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ <double 0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
23-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
24-
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
25-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[OFFSET:%.*]], i64 [[TMP0]]
26-
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OFFSET]], i64 [[TMP1]]
23+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
24+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[OFFSET:%.*]], i64 [[INDEX]]
25+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OFFSET]], i64 [[TMP0]]
2726
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
2827
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
2928
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP4]] to i64

llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,31 +21,30 @@ define void @pr58722_load_interleave_group(ptr %src, ptr %dst) {
2121
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
2222
; CHECK: vector.body:
2323
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
24-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
25-
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
26-
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
27-
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
24+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
25+
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
26+
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 3
27+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
2828
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
2929
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
3030
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP2]]
31-
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP3]]
32-
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
31+
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META0:![0-9]+]]
3332
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
34-
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 1
35-
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1
36-
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 1
3733
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 1
38-
; CHECK-NEXT: [[TMP13:%.*]] = load i24, ptr [[TMP9]], align 4, !alias.scope [[META0:![0-9]+]]
39-
; CHECK-NEXT: [[TMP14:%.*]] = load i24, ptr [[TMP10]], align 4, !alias.scope [[META0]]
40-
; CHECK-NEXT: [[TMP15:%.*]] = load i24, ptr [[TMP11]], align 4, !alias.scope [[META0]]
41-
; CHECK-NEXT: [[TMP16:%.*]] = load i24, ptr [[TMP12]], align 4, !alias.scope [[META0]]
34+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 1
35+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1
36+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 1
37+
; CHECK-NEXT: [[TMP13:%.*]] = load i24, ptr [[TMP12]], align 4, !alias.scope [[META0]]
38+
; CHECK-NEXT: [[TMP14:%.*]] = load i24, ptr [[TMP8]], align 4, !alias.scope [[META0]]
39+
; CHECK-NEXT: [[TMP15:%.*]] = load i24, ptr [[TMP9]], align 4, !alias.scope [[META0]]
40+
; CHECK-NEXT: [[TMP16:%.*]] = load i24, ptr [[TMP10]], align 4, !alias.scope [[META0]]
4241
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i24> poison, i24 [[TMP13]], i32 0
4342
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i24> [[TMP17]], i24 [[TMP14]], i32 1
4443
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i24> [[TMP18]], i24 [[TMP15]], i32 2
4544
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i24> [[TMP19]], i24 [[TMP16]], i32 3
4645
; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i24> [[TMP20]] to <4 x i32>
4746
; CHECK-NEXT: [[TMP22:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[TMP21]]
48-
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]]
47+
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
4948
; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP23]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
5049
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
5150
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000

llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -174,26 +174,25 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
174174
; CHECK: [[VECTOR_BODY]]:
175175
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
176176
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
177-
; CHECK-NEXT: [[IV:%.*]] = add i64 [[OFFSET_IDX]], 0
178-
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
179-
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
180-
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
181-
; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV]]
177+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 2
178+
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 4
179+
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 6
180+
; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr i64, ptr [[J]], i64 [[OFFSET_IDX]]
182181
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[GEP_J]], align 8
183182
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
184183
; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i64> [[STRIDED_VEC]] to <4 x i16>
185184
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
186185
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
187186
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
188187
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
189-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]]
190-
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]]
191-
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]]
192-
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP3]]
188+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[OFFSET_IDX]]
189+
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP0]]
190+
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]]
191+
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]]
193192
; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP6]], align 2
194-
; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP7]], align 2
195-
; CHECK-NEXT: store i16 [[TMP12]], ptr [[TMP8]], align 2
196-
; CHECK-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2
193+
; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP15]], align 2
194+
; CHECK-NEXT: store i16 [[TMP12]], ptr [[TMP16]], align 2
195+
; CHECK-NEXT: store i16 [[TMP13]], ptr [[TMP17]], align 2
197196
; CHECK-NEXT: store i64 0, ptr [[A]], align 8
198197
; CHECK-NEXT: store i64 0, ptr [[B]], align 8
199198
; CHECK-NEXT: store i64 0, ptr [[C]], align 8

0 commit comments

Comments (0)