[VPlan] Replicate VPScalarIVStepsRecipe by VF outside replicate regions. #170053
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-risc-v

Author: Florian Hahn (fhahn)

Changes

Extend replicateByVF to also handle VPScalarIVStepsRecipe. To do so, the patch adds a new lane operand to VPScalarIVStepsRecipe, which is only added when replicating. This enables removing a number of lane 0 computations. The lane operand will also be used to explicitly replicate replicate regions in a follow-up.

Depends on #169796 (included in PR).

Patch is 646.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/170053.diff

77 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 6ca750fc53279..294af92ee2496 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3788,7 +3788,7 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their scalar values.
class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
- public VPUnrollPartAccessor<3> {
+ public VPUnrollPartAccessor<4> {
Instruction::BinaryOps InductionOpcode;
public:
@@ -3812,10 +3812,13 @@ class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
~VPScalarIVStepsRecipe() override = default;
VPScalarIVStepsRecipe *clone() override {
- return new VPScalarIVStepsRecipe(
+ auto *NewR = new VPScalarIVStepsRecipe(
getOperand(0), getOperand(1), getOperand(2), InductionOpcode,
hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(),
getDebugLoc());
+ if (getNumOperands() == 4)
+ NewR->addOperand(getOperand(3));
+ return NewR;
}
/// Return true if this VPScalarIVStepsRecipe corresponds to part 0. Note that
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0baf7172e4443..601c9c9063ea6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2367,22 +2367,22 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
// Compute the scalar steps and save the results in State.
Type *IntStepTy =
IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
- Type *VecIVTy = nullptr;
- Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
- if (!FirstLaneOnly && State.VF.isScalable()) {
- VecIVTy = VectorType::get(BaseIVTy, State.VF);
- UnitStepVec =
- Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
- SplatStep = Builder.CreateVectorSplat(State.VF, Step);
- SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
- }
unsigned StartLane = 0;
unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
if (State.Lane) {
StartLane = State.Lane->getKnownLane();
EndLane = StartLane + 1;
+ } else if (getNumOperands() == 5) {
+ // Operand 3 is the Lane operand (when present after replicating by VF).
+ VPValue *Op3 = getOperand(3);
+ assert(Op3->isLiveIn() && "lane operand must be a live-in");
+ auto *C = cast<ConstantInt>(Op3->getLiveInIRValue());
+ unsigned Val = C->getZExtValue();
+ StartLane = Val;
+ EndLane = Val + 1;
}
+
Value *StartIdx0;
if (getUnrollPart(*this) == 0)
StartIdx0 = ConstantInt::get(IntStepTy, 0);
@@ -2396,19 +2396,6 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy);
}
- if (!FirstLaneOnly && State.VF.isScalable()) {
- auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
- auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
- if (BaseIVTy->isFloatingPointTy())
- InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
- auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
- auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
- State.set(this, Add);
- // It's useful to record the lane values too for the known minimum number
- // of elements so we do those below. This improves the code quality when
- // trying to extract the first element, for example.
- }
-
if (BaseIVTy->isFloatingPointTy())
StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
@@ -2422,7 +2409,10 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
"scalable");
auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
- State.set(this, Add, VPLane(Lane));
+ if (State.Lane)
+ State.set(this, Add, VPLane(Lane));
+ else
+ State.set(this, Add, VPLane(0));
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b12f8ccc73c7e..dcef3f54d5c2a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -793,12 +793,17 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
WideIV->getDebugLoc(), Builder);
// Update scalar users of IV to use Step instead.
- if (!HasOnlyVectorVFs)
+ if (!HasOnlyVectorVFs) {
WideIV->replaceAllUsesWith(Steps);
- else
- WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
- return U.usesScalars(WideIV);
- });
+ } else {
+ bool HasScalableVF = Plan.hasScalableVF();
+ WideIV->replaceUsesWithIf(Steps,
+ [WideIV, HasScalableVF](VPUser &U, unsigned) {
+ if (HasScalableVF)
+ return U.usesFirstLaneOnly(WideIV);
+ return U.usesScalars(WideIV);
+ });
+ }
}
}
@@ -1432,9 +1437,14 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
}
// VPScalarIVSteps for part 0 can be replaced by their start value, if only
- // the first lane is demanded.
+ // the first lane is demanded and both Lane and UnrollPart operands are 0.
if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
- if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
+ bool LaneIsZero = Steps->getNumOperands() >= 4 &&
+ match(Steps->getOperand(3), m_ZeroInt());
+ bool PartIsZero =
+ Steps->getNumOperands() < 5 || match(Steps->getOperand(4), m_ZeroInt());
+ if (Steps->isPart0() && LaneIsZero && PartIsZero &&
+ vputils::onlyFirstLaneUsed(Steps)) {
Steps->replaceAllUsesWith(Steps->getOperand(0));
return;
}
@@ -4306,9 +4316,9 @@ void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
for (VPBasicBlock *VPBB :
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (!isa<VPReplicateRecipe, VPInstruction>(&R))
+ if (!isa<VPScalarIVStepsRecipe, VPReplicateRecipe, VPInstruction>(&R))
continue;
- auto *DefR = cast<VPRecipeWithIRFlags>(&R);
+ auto *DefR = cast<VPSingleDefRecipe>(&R);
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index f215476b1e163..003686490d42c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -137,6 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
remapOperands(&PartIR, Part);
if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
+ ScalarIVSteps->addOperand(getConstantInt(0));
ScalarIVSteps->addOperand(getConstantInt(Part));
}
@@ -526,9 +527,21 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
/*IsSingleScalar=*/true, /*Mask=*/nullptr,
*RepR, *RepR, RepR->getDebugLoc());
} else {
- assert(isa<VPInstruction>(DefR) &&
+ assert((isa<VPInstruction, VPScalarIVStepsRecipe>(DefR)) &&
"DefR must be a VPReplicateRecipe or VPInstruction");
New = DefR->clone();
+ if (isa<VPScalarIVStepsRecipe>(New)) {
+ // Add or update lane operand for VPScalarIVStepsRecipe.
+ if (NewOps.size() == 3) {
+ NewOps.push_back(Plan.getConstantInt(IdxTy, 0));
+ New->addOperand(NewOps.back());
+ }
+ NewOps.push_back(Plan.getConstantInt(IdxTy, Lane.getKnownLane()));
+ New->addOperand(NewOps.back());
+ if (NewOps.size() == 5)
+ std::swap(NewOps[3], NewOps[4]);
+ }
+
for (const auto &[Idx, Op] : enumerate(NewOps)) {
New->setOperand(Idx, Op);
}
@@ -558,7 +571,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
SmallVector<VPRecipeBase *> ToRemove;
for (VPBasicBlock *VPBB : VPBBsToUnroll) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (!isa<VPInstruction, VPReplicateRecipe>(&R) ||
+ if (!isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(&R) ||
(isa<VPReplicateRecipe>(&R) &&
cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
(isa<VPInstruction>(&R) &&
@@ -566,6 +579,19 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
continue;
+ if (isa<VPScalarIVStepsRecipe>(&R) && Plan.hasScalarVFOnly()) {
+ // Add lane operand to VPScalarIVStepsRecipe only when the plan is
+ // scalar.
+ if (R.getNumOperands() == 4) {
+ R.addOperand(R.getOperand(3));
+ R.setOperand(3, Plan.getConstantInt(IdxTy, 0));
+ } else {
+ R.addOperand(Plan.getConstantInt(IdxTy, 0));
+ R.addOperand(Plan.getConstantInt(IdxTy, 0));
+ }
+ continue;
+ }
+
auto *DefR = cast<VPSingleDefRecipe>(&R);
VPBuilder Builder(DefR);
if (DefR->getNumUsers() == 0) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll
index 14f5dd7d41691..46fc9646356c8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fold-tail-low-trip-count.ll
@@ -16,11 +16,10 @@ define void @low_trip_count_small(i32 %x, ptr %dst) {
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 0
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[DST]], i64 1
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[DST]], i64 2
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[DST]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[DST]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x ptr> [[TMP1]], ptr [[NEXT_GEP2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> [[TMP2]], ptr [[NEXT_GEP3]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> [[TMP3]], ptr [[NEXT_GEP4]], i32 3
@@ -28,31 +27,31 @@ define void @low_trip_count_small(i32 %x, ptr %dst) {
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 1
; CHECK-NEXT: store i8 0, ptr [[TMP7]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
; CHECK: [[PRED_STORE_CONTINUE]]:
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
-; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]]
+; CHECK: [[PRED_STORE_IF4]]:
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 1
; CHECK-NEXT: store i8 0, ptr [[TMP9]], align 1
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
-; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE5]]
+; CHECK: [[PRED_STORE_CONTINUE5]]:
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
-; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
+; CHECK: [[PRED_STORE_IF6]]:
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 1
; CHECK-NEXT: store i8 0, ptr [[TMP11]], align 1
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
-; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE7]]
+; CHECK: [[PRED_STORE_CONTINUE7]]:
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
-; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
+; CHECK: [[PRED_STORE_IF8]]:
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 1
; CHECK-NEXT: store i8 0, ptr [[TMP13]], align 1
-; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
-; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE9]]
+; CHECK: [[PRED_STORE_CONTINUE9]]:
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
index 26a9545764091..33be739be4718 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
@@ -20,10 +20,9 @@ define dso_local double @test(ptr nocapture noundef readonly %data, ptr nocaptur
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ <double 0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[OFFSET:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OFFSET]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[OFFSET:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OFFSET]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP4]] to i64
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
index 2557ae55d2c85..b396d584a8497 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
@@ -21,31 +21,30 @@ define void @pr58722_load_interleave_group(ptr %src, ptr %dst) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP3]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META0:![0-9]+]]
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 1
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 1
-; CHECK-NEXT: [[TMP13:%.*]] = load i24, ptr [[TMP9]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[TMP14:%.*]] = load i24, ptr [[TMP10]], align 4, !alias.scope [[META0]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i24, ptr [[TMP11]], align 4, !alias.scope [[META0]]
-; CHECK-NEXT: [[TMP16:%.*]] = load i24, ptr [[TMP12]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = load i24, ptr [[TMP12]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i24, ptr [[TMP8]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i24, ptr [[TMP9]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i24, ptr [[TMP10]], align 4, !alias.scope [[META0]]
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i24> poison, i24 [[TMP13]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i24> [[TMP17]], i24 [[TMP14]], i32 1
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i24> [[TMP18]], i24 [[TMP15]], i32 2
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i24> [[TMP19]], i24 [[TMP16]], i32 3
; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i24> [[TMP20]] to <4 x i32>
; CHECK-NEXT: [[TMP22:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP23]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
index 5b4bb70e6a479..2108c15b7f838 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
@@ -174,11 +174,10 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-NEXT: [[IV:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
-; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 2
+; ...
[truncated]
Extend replicateByVF to also handle VPScalarIVStepsRecipe. To do so, the patch adds a new lane operand to VPScalarIVStepsRecipe, which is only added when replicating. This enables removing a number of lane 0 computations. The lane operand will also be used to explicitly replicate replicate regions in a follow-up. Depends on llvm#169796 (included in PR).
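To illustrate the kind of lane 0 computation this removes, here is a minimal before/after sketch in LLVM IR, modeled on the check-line updates in the tests above (value names are illustrative, not taken verbatim from any single test): once lane 0 can be folded to the start value, the add of 0 feeding the first scalar address disappears and the induction value is used directly.

  ; Before:
  %lane0 = add i64 %index, 0
  %lane1 = add i64 %index, 1
  %gep0 = getelementptr inbounds i32, ptr %offset, i64 %lane0
  %gep1 = getelementptr inbounds i32, ptr %offset, i64 %lane1

  ; After:
  %lane1 = add i64 %index, 1
  %gep0 = getelementptr inbounds i32, ptr %offset, i64 %index
  %gep1 = getelementptr inbounds i32, ptr %offset, i64 %lane1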
Force-pushed from 6c0f2fc to a6d1ac1.
This patch adds a new replicateReplicateRegionsByVF transform to unroll replicate regions by VF, dissolving them. The transform creates VF copies of the replicate-region's content, connects them, and converts recipes to single-scalar variants for the corresponding lanes. The initial version skips regions with live-outs (VPPredInstPHIRecipe), which will be added in follow-up patches. Depends on llvm#170053.