Commit 5b05441

fhahn authored and aokblast committed
[VPlan] Run narrowInterleaveGroups during general VPlan optimizations. (llvm#149706)
Move narrowInterleaveGroups to the general VPlan optimization stage. To do so, narrowInterleaveGroups now has to find a suitable VF where all interleave groups are consecutive and saturate the full vector width. If such a VF is found, the original VPlan is split into 2: a) a new clone which contains all VFs of Plan except VFToOptimize, and b) the original Plan with VFToOptimize as its single VF. The original Plan is then optimized. If a new copy for the other VFs has been created, it is returned and the caller has to add it to the list of candidate plans.

Together with llvm#149702, this allows taking the narrowed interleave groups into account when computing costs to choose the best VF and interleave count. One example where we currently miss interleaving/unrolling when narrowing interleave groups is https://godbolt.org/z/Yz77zbacz

PR: llvm#149706
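
For context, a minimal sketch (hypothetical, not part of the commit or its tests) of the kind of source loop whose interleave groups this transform can narrow: each iteration reads and writes a complete { double, double } pair, so the vectorizer forms load and store interleave groups with factor 2 and 2 members; with VF=2 each group already covers a full 128-bit vector register, so the groups can be rewritten as plain wide loads and stores, as in the narrowed CHECK lines further down.

// Hypothetical C++ example of the targeted pattern; all names are illustrative.
struct Pair { double a, b; };

void addToPairs(Pair *Res, const Pair *A, double X, int N) {
  for (int I = 0; I != N; ++I) {
    Res[I].a = A[I].a + X; // member 0 of the load/store interleave groups
    Res[I].b = A[I].b + X; // member 1 of the load/store interleave groups
  }
}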
1 parent 890fbcd commit 5b05441

12 files changed: +381 -191 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 4 additions & 3 deletions
@@ -7231,9 +7231,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     return DenseMap<const SCEV *, Value *>();
   }
 
-  VPlanTransforms::narrowInterleaveGroups(
-      BestVPlan, BestVF,
-      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
   VPlanTransforms::removeDeadRecipes(BestVPlan);
 
   VPlanTransforms::convertToConcreteRecipes(BestVPlan);
@@ -8202,6 +8199,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
     if (CM.foldTailWithEVL())
       VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
                                *Plan, CM.getMaxSafeElements());
+
+    if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
+      VPlans.push_back(std::move(P));
+
     assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
     VPlans.push_back(std::move(Plan));
   }

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 1 addition & 0 deletions
@@ -1213,6 +1213,7 @@ VPlan *VPlan::duplicate() {
   }
   Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
   Old2NewVPValues[&VF] = &NewPlan->VF;
+  Old2NewVPValues[&UF] = &NewPlan->UF;
   Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
   if (BackedgeTakenCount) {
     NewPlan->BackedgeTakenCount = new VPValue();

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 12 additions & 0 deletions
@@ -4152,6 +4152,9 @@ class VPlan {
   /// Represents the vectorization factor of the loop.
   VPValue VF;
 
+  /// Represents the symbolic unroll factor of the loop.
+  VPValue UF;
+
   /// Represents the loop-invariant VF * UF of the vector loop region.
   VPValue VFxUF;
 
@@ -4305,6 +4308,9 @@ class VPlan {
   VPValue &getVF() { return VF; };
   const VPValue &getVF() const { return VF; };
 
+  /// Returns the symbolic UF of the vector loop region.
+  VPValue &getSymbolicUF() { return UF; };
+
   /// Returns VF * UF of the vector loop region.
   VPValue &getVFxUF() { return VFxUF; }
 
@@ -4314,6 +4320,12 @@ class VPlan {
 
   void addVF(ElementCount VF) { VFs.insert(VF); }
 
+  /// Remove \p VF from the plan.
+  void removeVF(ElementCount VF) {
+    assert(hasVF(VF) && "tried to remove VF not present in plan");
+    VFs.remove(VF);
+  }
+
   void setVF(ElementCount VF) {
     assert(hasVF(VF) && "Cannot set VF not already in plan");
     VFs.clear();

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 75 additions & 41 deletions
@@ -3956,6 +3956,9 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
   // used.
   // TODO: Assert that they aren't used.
 
+  VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+  Plan.getSymbolicUF().replaceAllUsesWith(UF);
+
   // If there are no users of the runtime VF, compute VFxUF by constant folding
   // the multiplication of VF and UF.
   if (VF.getNumUsers() == 0) {
@@ -3975,7 +3978,6 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
   }
   VF.replaceAllUsesWith(RuntimeVF);
 
-  VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
   VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
   VFxUF.replaceAllUsesWith(MulByUF);
 }
@@ -4043,14 +4045,14 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
   return false;
 }
 
-/// Returns true if \p IR is a full interleave group with factor and number of
-/// members both equal to \p VF. The interleave group must also access the full
-/// vector width \p VectorRegWidth.
-static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
-                                         unsigned VF, VPTypeAnalysis &TypeInfo,
-                                         unsigned VectorRegWidth) {
+/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
+/// number of members both equal to VF. The interleave group must also access
+/// the full vector width.
+static std::optional<ElementCount> isConsecutiveInterleaveGroup(
+    VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
+    VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
   if (!InterleaveR)
-    return false;
+    return std::nullopt;
 
   Type *GroupElementTy = nullptr;
   if (InterleaveR->getStoredValues().empty()) {
@@ -4059,21 +4061,35 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
                 [&TypeInfo, GroupElementTy](VPValue *Op) {
                   return TypeInfo.inferScalarType(Op) == GroupElementTy;
                 }))
-      return false;
+      return std::nullopt;
   } else {
     GroupElementTy =
        TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
     if (!all_of(InterleaveR->getStoredValues(),
                 [&TypeInfo, GroupElementTy](VPValue *Op) {
                   return TypeInfo.inferScalarType(Op) == GroupElementTy;
                 }))
-      return false;
+      return std::nullopt;
   }
 
-  unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
-  auto IG = InterleaveR->getInterleaveGroup();
-  return IG->getFactor() == VF && IG->getNumMembers() == VF &&
-         GroupSize == VectorRegWidth;
+  auto GetVectorWidthForVF = [&TTI](ElementCount VF) {
+    TypeSize Size = TTI.getRegisterBitWidth(
+        VF.isFixed() ? TargetTransformInfo::RGK_FixedWidthVector
+                     : TargetTransformInfo::RGK_ScalableVector);
+    assert(Size.isScalable() == VF.isScalable() &&
+           "if Size is scalable, VF must to and vice versa");
+    return Size.getKnownMinValue();
+  };
+
+  for (ElementCount VF : VFs) {
+    unsigned MinVal = VF.getKnownMinValue();
+    unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
+    auto IG = InterleaveR->getInterleaveGroup();
+    if (IG->getFactor() == MinVal && IG->getNumMembers() == MinVal &&
+        GroupSize == GetVectorWidthForVF(VF))
+      return {VF};
+  }
+  return std::nullopt;
 }
 
 /// Returns true if \p VPValue is a narrow VPValue.
@@ -4084,16 +4100,18 @@ static bool isAlreadyNarrow(VPValue *VPV) {
   return RepR && RepR->isSingleScalar();
 }
 
-void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
-                                             unsigned VectorRegWidth) {
+std::unique_ptr<VPlan>
+VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
+                                        const TargetTransformInfo &TTI) {
+  using namespace llvm::VPlanPatternMatch;
   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
+
   if (!VectorLoop)
-    return;
+    return nullptr;
 
   VPTypeAnalysis TypeInfo(Plan);
-
-  unsigned VFMinVal = VF.getKnownMinValue();
   SmallVector<VPInterleaveRecipe *> StoreGroups;
+  std::optional<ElementCount> VFToOptimize;
   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
     if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount()))
       continue;
@@ -4107,30 +4125,33 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     // * recipes writing to memory except interleave groups
     // Only support plans with a canonical induction phi.
     if (R.isPhi())
-      return;
+      return nullptr;
 
     auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
     if (R.mayWriteToMemory() && !InterleaveR)
-      return;
-
-    // Do not narrow interleave groups if there are VectorPointer recipes and
-    // the plan was unrolled. The recipe implicitly uses VF from
-    // VPTransformState.
-    // TODO: Remove restriction once the VF for the VectorPointer offset is
-    // modeled explicitly as operand.
-    if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
-      return;
+      return nullptr;
 
     // All other ops are allowed, but we reject uses that cannot be converted
     // when checking all allowed consumers (store interleave groups) below.
     if (!InterleaveR)
      continue;
 
-    // Bail out on non-consecutive interleave groups.
-    if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
-                                      VectorRegWidth))
-      return;
-
+    // Try to find a single VF, where all interleave groups are consecutive and
+    // saturate the full vector width. If we already have a candidate VF, check
+    // if it is applicable for the current InterleaveR, otherwise look for a
+    // suitable VF across the Plans VFs.
+    //
+    if (VFToOptimize) {
+      if (!isConsecutiveInterleaveGroup(InterleaveR, {*VFToOptimize}, TypeInfo,
+                                        TTI))
+        return nullptr;
+    } else {
+      if (auto VF = isConsecutiveInterleaveGroup(
+              InterleaveR, to_vector(Plan.vectorFactors()), TypeInfo, TTI))
+        VFToOptimize = *VF;
+      else
+        return nullptr;
+    }
     // Skip read interleave groups.
     if (InterleaveR->getStoredValues().empty())
      continue;
@@ -4164,24 +4185,34 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
         InterleaveR->getStoredValues()[0]->getDefiningRecipe());
     if (!WideMember0)
-      return;
+      return nullptr;
     for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
       auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
       if (!R || R->getOpcode() != WideMember0->getOpcode() ||
          R->getNumOperands() > 2)
-        return;
+        return nullptr;
      if (any_of(enumerate(R->operands()),
                 [WideMember0, Idx = I](const auto &P) {
                   const auto &[OpIdx, OpV] = P;
                   return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
                 }))
-        return;
+        return nullptr;
     }
     StoreGroups.push_back(InterleaveR);
   }
 
   if (StoreGroups.empty())
-    return;
+    return nullptr;
+
+  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
+  // original Plan into 2: a) a new clone which contains all VFs of Plan, except
+  // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
+  std::unique_ptr<VPlan> NewPlan;
+  if (size(Plan.vectorFactors()) != 1) {
+    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
+    Plan.setVF(*VFToOptimize);
+    NewPlan->removeVF(*VFToOptimize);
+  }
 
   // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
   SmallPtrSet<VPValue *, 4> NarrowedOps;
@@ -4252,9 +4283,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
   VPBuilder PHBuilder(Plan.getVectorPreheader());
 
-  VPValue *UF = Plan.getOrAddLiveIn(
-      ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
-  if (VF.isScalable()) {
+  VPValue *UF = &Plan.getSymbolicUF();
+  if (VFToOptimize->isScalable()) {
     VPValue *VScale = PHBuilder.createElementCount(
         CanIV->getScalarType(), ElementCount::getScalable(1));
     VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
@@ -4266,6 +4296,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
         Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
   }
   removeDeadRecipes(Plan);
+  assert(none_of(*VectorLoop->getEntryBasicBlock(),
+                 IsaPred<VPVectorPointerRecipe>) &&
+         "All VPVectorPointerRecipes should have been removed");
+  return NewPlan;
 }
 
 /// Add branch weight metadata, if the \p Plan's middle block is terminated by a

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 14 additions & 8 deletions
@@ -341,14 +341,20 @@ struct VPlanTransforms {
   static DenseMap<const SCEV *, Value *> expandSCEVs(VPlan &Plan,
                                                      ScalarEvolution &SE);
 
-  /// Try to convert a plan with interleave groups with VF elements to a plan
-  /// with the interleave groups replaced by wide loads and stores processing VF
-  /// elements, if all transformed interleave groups access the full vector
-  /// width (checked via \o VectorRegWidth). This effectively is a very simple
-  /// form of loop-aware SLP, where we use interleave groups to identify
-  /// candidates.
-  static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
-                                     unsigned VectorRegWidth);
+  /// Try to find a single VF among \p Plan's VFs for which all interleave
+  /// groups (with known minimum VF elements) can be replaced by wide loads and
+  /// stores processing VF elements, if all transformed interleave groups access
+  /// the full vector width (checked via the maximum vector register width). If
+  /// the transformation can be applied, the original \p Plan will be split in
+  /// 2:
+  /// 1. The original Plan with the single VF containing the optimized recipes
+  ///    using wide loads instead of interleave groups.
+  /// 2. A new clone which contains all VFs of Plan except the optimized VF.
+  ///
+  /// This effectively is a very simple form of loop-aware SLP, where we use
+  /// interleave groups to identify candidates.
+  static std::unique_ptr<VPlan>
+  narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);
 
   /// Predicate and linearize the control-flow in the only loop region of
   /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
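
As a usage sketch of the new contract (mirroring the buildVPlansWithVPRecipes change in LoopVectorize.cpp above; Plan, TTI and VPlans are assumed to be the caller's existing objects):

  // The returned clone, if any, keeps the VFs that were not optimized and is
  // added as an additional candidate plan; the original plan is left with the
  // single, optimized VF.
  if (std::unique_ptr<VPlan> P =
          VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
    VPlans.push_back(std::move(P));
  VPlans.push_back(std::move(Plan));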

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll

Lines changed: 12 additions & 32 deletions
@@ -175,28 +175,18 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
@@ -237,28 +227,18 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
