Skip to content

Commit 2a351ed

Browse files
committed
Move SCEV expansion so it's done after we know if we're tail-folding or not
1 parent f6b3fbb commit 2a351ed

File tree

4 files changed

+40
-29
lines changed

4 files changed

+40
-29
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,7 @@ class LoopVectorizationPlanner {
363363
/// loop iteration.
364364
std::optional<VectorizationFactor>
365365
plan(ElementCount UserVF, unsigned UserIC,
366-
SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask);
366+
std::optional<ArrayRef<PointerDiffInfo>> DiffChecks, std::function<Value*(const SCEV *)> Expander, bool &HasAliasMask);
367367

368368
/// Use the VPlan-native path to plan how to best vectorize, return the best
369369
/// VF and its cost.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6906,8 +6906,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
69066906

69076907
std::optional<VectorizationFactor>
69086908
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
6909-
SmallVector<PointerDiffInfoValues> RTChecks,
6910-
bool &HasAliasMask) {
6909+
std::optional<ArrayRef<PointerDiffInfo>> RTChecks,
6910+
std::function<Value*(const SCEV*)> Expander, bool &HasAliasMask) {
69116911
assert(OrigLoop->isInnermost() && "Inner loop expected.");
69126912
CM.collectValuesToIgnore();
69136913
CM.collectElementTypesForWidening();
@@ -6916,6 +6916,18 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
69166916
if (!MaxFactors) // Cases that should be neither vectorized nor interleaved.
69176917
return std::nullopt;
69186918

6919+
// VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
6920+
// here and put them into a list.
6921+
SmallVector<PointerDiffInfoValues> DiffChecksValues;
6922+
if (RTChecks.has_value()
6923+
&& useActiveLaneMask(CM.getTailFoldingStyle(true))) {
6924+
for (auto Check : *RTChecks) {
6925+
Value *Sink = Expander(Check.SinkStart);
6926+
Value *Src = Expander(Check.SrcStart);
6927+
DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
6928+
}
6929+
}
6930+
69196931
// Invalidate interleave groups if all blocks of loop will be predicated.
69206932
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
69216933
!useMaskedInterleavedAccesses(TTI)) {
@@ -6944,7 +6956,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
69446956
CM.collectInLoopReductions();
69456957
if (CM.selectUserVectorizationFactor(UserVF)) {
69466958
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6947-
buildVPlansWithVPRecipes(UserVF, UserVF, RTChecks, HasAliasMask);
6959+
buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecksValues, HasAliasMask);
69486960
if (!hasPlanWithVF(UserVF)) {
69496961
LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
69506962
<< ".\n");
@@ -6979,9 +6991,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
69796991
}
69806992

69816993
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
6982-
RTChecks, HasAliasMask);
6994+
DiffChecksValues, HasAliasMask);
69836995
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
6984-
RTChecks, HasAliasMask);
6996+
DiffChecksValues, HasAliasMask);
69856997

69866998
LLVM_DEBUG(printPlans(dbgs()));
69876999
if (VPlans.empty())
@@ -9907,24 +9919,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
99079919
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getDataLayout(),
99089920
AddBranchWeights);
99099921

9910-
// VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
9911-
// here and put them into a list.
9912-
std::optional<ArrayRef<PointerDiffInfo>> DiffChecks =
9913-
LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks();
9914-
SmallVector<PointerDiffInfoValues> DiffChecksValues;
9915-
if (DiffChecks.has_value() &&
9916-
useActiveLaneMask(CM.getTailFoldingStyle(true))) {
9917-
Instruction *Loc = L->getLoopPreheader()->getTerminator();
9918-
for (auto Check : *DiffChecks) {
9919-
Value *Sink = Checks.expandCodeForMemCheck(Check.SinkStart, Loc);
9920-
Value *Src = Checks.expandCodeForMemCheck(Check.SrcStart, Loc);
9921-
DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
9922-
}
9923-
}
9924-
99259922
// Plan how to best vectorize, return the best VF and its cost.
9923+
auto Expand = [&Checks, &L](const SCEV *S) {
9924+
return Checks.expandCodeForMemCheck(S, L->getLoopPreheader()->getTerminator());
9925+
};
99269926
std::optional<VectorizationFactor> MaybeVF =
9927-
LVP.plan(UserVF, UserIC, DiffChecksValues, Checks.HasAliasMask);
9927+
LVP.plan(UserVF, UserIC, LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(), Expand, Checks.HasAliasMask);
99289928

99299929
VectorizationFactor VF = VectorizationFactor::Disabled();
99309930
unsigned IC = 1;

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
153153
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
154154
; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
155155
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
156-
; PRED-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
156+
; PRED-NEXT: br label [[VECTOR_PH:%.*]]
157157
; PRED: vector.ph:
158158
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
159159
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
@@ -163,6 +163,13 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
163163
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
164164
; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
165165
; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
166+
; PRED-NEXT: [[SUB_DIFF:%.*]] = sub i64 [[SRC2]], [[DST1]]
167+
; PRED-NEXT: [[DIFF:%.*]] = sdiv i64 [[SUB_DIFF]], 1
168+
; PRED-NEXT: [[NEG_COMPARE:%.*]] = icmp slt i64 [[DIFF]], 0
169+
; PRED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[NEG_COMPARE]], i64 0
170+
; PRED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[DOTSPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
171+
; PRED-NEXT: [[PTR_DIFF_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[DIFF]])
172+
; PRED-NEXT: [[ACTIVE_LANE_MASK_ALIAS:%.*]] = or <vscale x 8 x i1> [[PTR_DIFF_LANE_MASK]], [[DOTSPLAT]]
166173
; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
167174
; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
168175
; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
@@ -177,9 +184,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
177184
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
178185
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
179186
; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0
187+
; PRED-NEXT: [[TMP30:%.*]] = and <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK_ALIAS]]
180188
; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
181189
; PRED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
182-
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
190+
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP30]], <vscale x 8 x i8> poison)
183191
; PRED-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
184192
; PRED-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
185193
; PRED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
@@ -188,16 +196,19 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
188196
; PRED-NEXT: [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
189197
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
190198
; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
191-
; PRED-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
192-
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
199+
; PRED-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[TMP30]])
200+
; PRED-NEXT: [[TMP31:%.*]] = zext <vscale x 8 x i1> [[ACTIVE_LANE_MASK_ALIAS]] to <vscale x 8 x i8>
201+
; PRED-NEXT: [[TMP32:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[TMP31]])
202+
; PRED-NEXT: [[TMP33:%.*]] = zext i8 [[TMP32]] to i64
203+
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP33]]
193204
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
194205
; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
195206
; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
196207
; PRED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
197208
; PRED: middle.block:
198209
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
199210
; PRED: scalar.ph:
200-
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
211+
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
201212
; PRED-NEXT: br label [[LOOP:%.*]]
202213
; PRED: loop:
203214
; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]

llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ define void @pred_loop(ptr %off, ptr %data, ptr %dst, i32 %n) #0 {
99
; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %i.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
1010
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add = add nuw nsw i32 %i.09, 1
1111
; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %data, i32 %add
12-
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i32, ptr %arrayidx, align 4
13-
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add1 = add nsw i32 %0, 5
12+
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i32, ptr %arrayidx, align 4
13+
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add1 = add nsw i32 %1, 5
1414
; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %dst, i32 %i.09
1515
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add1, ptr %arrayidx2, align 4
1616
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond.not = icmp eq i32 %add, %n

0 commit comments

Comments
 (0)